{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# TODO: check the Phrasal paper to see how they split the corpus\n",
    "# (they only use the en-fr part of the Autodesk data, not en-de)\n",
    "# NOTE: according to the Autodesk README, the data can contain duplicates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import bz2\n",
    "\n",
    "# GERMAN\n",
    "# autodesk_file = '/media/1tb_drive/parallel_data/autodesk/deu.mt.bz2'\n",
    "\n",
    "# FRENCH\n",
    "# autodesk_file = '/media/1tb_drive/parallel_data/autodesk/fra.mt.bz2'\n",
    "\n",
    "# PORTUGUESE\n",
    "autodesk_file = '/media/1tb_drive/parallel_data/autodesk/ptg.mt.bz2'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Decompress the whole corpus into memory and decode it to unicode before splitting into rows\n",
    "with bz2.BZ2File(autodesk_file) as compressed_corpus:\n",
    "    decoded_corpus = compressed_corpus.read().strip().decode('utf8')\n",
    "autodesk_rows = decoded_corpus.split('\\n')\n",
    "\n",
    "# Columns within a row are separated by the code point U+F8FF\n",
    "autodesk_rows_cols = []\n",
    "for row in autodesk_rows:\n",
    "    autodesk_rows_cols.append(row.split(u'\\uf8ff'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from subprocess import Popen, PIPE\n",
    "\n",
    "# tokenization funcs\n",
    "SOURCE_LANG = 'en'\n",
    "# TARGET_LANG = 'de'\n",
    "# TARGET_LANG = 'fr'\n",
    "TARGET_LANG = 'pt'\n",
    "\n",
    "tokenize_script = '/media/1tb_drive/parallel_data/en-de/chris_en-de_big_corpus/train/processed/tokenizer.perl'\n",
    "source_tokenizer_cmd = [tokenize_script, '-l', SOURCE_LANG, '-q', '-', '-no-escape', '1']\n",
    "target_tokenizer_cmd = [tokenize_script, '-l', TARGET_LANG, '-q', '-', '-no-escape', '1']\n",
    "\n",
    "# NOTE: it's much slower to create a new tokenizer every time, see entity linking for persistent tokenizer\n",
    "def tokenize(segment, src_trg):\n",
    "    \"\"\"Tokenize one unicode segment with the external tokenizer script.\n",
    "\n",
    "    A fresh tokenizer subprocess is started on every call. ``src_trg`` picks the\n",
    "    command line: 'source' uses ``source_tokenizer_cmd``, any other value uses\n",
    "    ``target_tokenizer_cmd``. The segment is written to the process as UTF-8 and\n",
    "    its stdout is returned stripped and decoded from UTF-8.\n",
    "    \"\"\"\n",
    "    tokenizer_cmd = source_tokenizer_cmd if src_trg == 'source' else target_tokenizer_cmd\n",
    "    tokenizer_proc = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)\n",
    "    tokenized_bytes, _ = tokenizer_proc.communicate(segment.encode('utf-8'))\n",
    "    return tokenized_bytes.strip().decode('utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# BPE encoding func\n",
    "from subword_nmt.apply_bpe import BPE\n",
    "\n",
    "# BPE_CODE_FILE = '/media/1tb_drive/parallel_data/en-de/chris_en-de_big_corpus/train/processed/all_text_both_EN_and_DE.79000.bpe.codes'\n",
    "# BPE_CODE_FILE = '/media/1tb_drive/parallel_data/en-fr/phrasal_acl/all_text_both_EN_and_FR.79000.bpe.codes'\n",
    "BPE_CODE_FILE = '/media/1tb_drive/parallel_data/en-pt/all_text_both_EN_and_PT.80000.bpe.codes'\n",
    "\n",
    "\n",
    "# BPE reads the merge table inside its constructor, so the file handle can be\n",
    "# closed as soon as the encoder has been built instead of staying open for the session\n",
    "with open(BPE_CODE_FILE) as bpe_codes:\n",
    "    bpe_encoder = BPE(bpe_codes)\n",
    "\n",
    "\n",
    "def bpe_encode(text):\n",
    "    \"\"\"Return ``text`` segmented into BPE subword units by the shared ``bpe_encoder``.\"\"\"\n",
    "    return bpe_encoder.segment(text)\n",
    "\n",
    "# python apply_bpe.py -c  < /home/chris/projects/neural_qe/data/wmt16/task2_en-de_training/train.src > /home/chris/projects/neural_qe/data/wmt16/task2_en-de_training/train.src.bpe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(autodesk_rows_cols[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'The absolute coordinates of your pointing device is updated continuously and is displayed in the status bar',\n",
       " u'As coordenadas absolutas de seu dispositivo apontador s\\xe3o atualizadas continuamente e \\xe9 exibido na barra de status',\n",
       " u'As coordenadas absolutas de seu dispositivo apontador s\\xe3o atualizadas continuamente e exibidas na barra de status.',\n",
       " u'ACD',\n",
       " u'2014',\n",
       " u'MT',\n",
       " u'0.000',\n",
       " u'58',\n",
       " u'',\n",
       " u'2012/11/19 19:24:51',\n",
       " u'\\u25ca\\xf7']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "autodesk_rows_cols[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# columns according to README\n",
    "# 1) EN-US source segment\n",
    "# 2) target-language raw MT output (for .mt.bz2 files) or raw TM output (for .tm.bz2 files)\n",
    "# 3) final post-edited target-language segment\n",
    "# 4) Autodesk product code\n",
    "# 5) product release identifier\n",
    "# 6) translation type (FUZZY or MT)\n",
    "# 7) raw MT score from Moses (with unknown-word penalties discarded)\n",
    "# 8) TM fuzzy match score\n",
    "# 9) TMX-style XML-encapsulated placeholder content (<phs>…</phs>)\n",
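    "# 10) entry creation date\n",
    "# NOTE: each row actually splits into 11 fields; the extra trailing field is not covered by the list above\n"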
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Transpose the rows into one tuple per column; the 11th, trailing field is discarded\n",
    "(\n",
    "    raw_sources,\n",
    "    raw_hyps,\n",
    "    raw_refs,\n",
    "    product_codes,\n",
    "    release_identifiers,\n",
    "    trans_types,\n",
    "    mt_scores,\n",
    "    fuzzy_matches,\n",
    "    placeholders,\n",
    "    creation_dates,\n",
    "    _,\n",
    ") = zip(*autodesk_rows_cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "79608"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(autodesk_rows_cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "79608"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(raw_sources)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from collections import OrderedDict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "no_dups = OrderedDict(zip(raw_sources, raw_refs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "66243"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(no_dups)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Materialise the (source, reference) pairs as a real list: random.shuffle needs a\n",
    "# mutable sequence, and on Python 3 dict.items() only returns a view\n",
    "src_refs = list(no_dups.items())\n",
    "# Fixed seed so the shuffled order is reproducible\n",
    "random.seed(37)\n",
    "random.shuffle(src_refs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Hold out the last DEV_SET_SIZE shuffled pairs as the dev set; everything before them is training data\n",
    "DEV_SET_SIZE = 1000\n",
    "train_src_refs = src_refs[:-DEV_SET_SIZE]\n",
    "dev_src_refs = src_refs[-DEV_SET_SIZE:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_sources, train_refs = zip(*train_src_refs)\n",
    "dev_sources, dev_refs = zip(*dev_src_refs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "80721"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(train_sources)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load the big corpus and extract noun chunks across the whole language; these provide the general-corpus frequency statistics.\n",
    "# We want to compare those frequencies with the target-domain frequencies to see what is actually a term."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import spacy\n",
    "en_nlp = spacy.load('en')\n",
    "de_nlp = spacy.load('de')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "en_doc = en_nlp(u'Hello, world. Here are two sentences.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "tc = [c for c in en_doc.noun_chunks]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "c1 = tc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "u'two sentences'"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c1.orth_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "processed 0 segments\n",
      "processed 5000 segments\n",
      "processed 10000 segments\n",
      "processed 15000 segments\n",
      "processed 20000 segments\n",
      "processed 25000 segments\n",
      "processed 30000 segments\n",
      "processed 35000 segments\n",
      "processed 40000 segments\n",
      "processed 45000 segments\n",
      "processed 50000 segments\n",
      "processed 55000 segments\n",
      "processed 60000 segments\n",
      "processed 65000 segments\n",
      "processed 70000 segments\n",
      "processed 75000 segments\n",
      "processed 80000 segments\n",
      "processed 85000 segments\n",
      "processed 90000 segments\n",
      "processed 95000 segments\n",
      "processed 100000 segments\n",
      "processed 105000 segments\n",
      "processed 0 segments\n",
      "processed 5000 segments\n",
      "processed 10000 segments\n",
      "processed 15000 segments\n",
      "processed 20000 segments\n",
      "processed 25000 segments\n",
      "processed 30000 segments\n",
      "processed 35000 segments\n",
      "processed 40000 segments\n",
      "processed 45000 segments\n",
      "processed 50000 segments\n",
      "processed 55000 segments\n",
      "processed 60000 segments\n",
      "processed 65000 segments\n",
      "processed 70000 segments\n",
      "processed 75000 segments\n",
      "processed 80000 segments\n",
      "processed 85000 segments\n",
      "processed 90000 segments\n",
      "processed 95000 segments\n",
      "processed 100000 segments\n",
      "processed 105000 segments\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# de_doc = de_nlp(u'ich bin ein Berliner.')\n",
    "\n",
    "\n",
    "def extract_noun_chunks(nlp, segments):\n",
    "    \"\"\"Parse ``segments`` with ``nlp.pipe`` and return one list of noun-chunk strings per segment.\n",
    "\n",
    "    A progress line is printed every 5000 segments.\n",
    "    \"\"\"\n",
    "    chunks_per_segment = []\n",
    "    for i, doc in enumerate(nlp.pipe(segments, batch_size=50, n_threads=10)):\n",
    "        chunks_per_segment.append([c.orth_ for c in doc.noun_chunks])\n",
    "        if i % 5000 == 0:\n",
    "            print('processed {} segments'.format(i))\n",
    "    return chunks_per_segment\n",
    "\n",
    "\n",
    "source_chunks = extract_noun_chunks(en_nlp, iter(train_sources))\n",
    "# NOTE(review): the reference (target-language) segments are also parsed with the\n",
    "# English pipeline `en_nlp` -- confirm this is intended\n",
    "ref_chunks = extract_noun_chunks(en_nlp, iter(train_refs))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from collections import defaultdict, Counter\n",
    "\n",
    "# source chunk -> Counter of the target chunks found in the same segment pair\n",
    "chunk_map = defaultdict(Counter)\n",
    "# occurrence counts of every chunk on each side\n",
    "source_chunk_occs = Counter()\n",
    "target_chunk_occs = Counter()\n",
    "\n",
    "# NOTE(review): chunk_map counts a (source, target) chunk pair at most once per segment,\n",
    "# while the two *_occs counters count every repeat inside a segment -- confirm this is intended\n",
    "for segment_src_chunks, segment_trg_chunks in zip(source_chunks, ref_chunks):\n",
    "    unique_trg_chunks = set(segment_trg_chunks)\n",
    "    for chunk in set(segment_src_chunks):\n",
    "        chunk_map[chunk].update(unique_trg_chunks)\n",
    "\n",
    "    source_chunk_occs.update(segment_src_chunks)\n",
    "    target_chunk_occs.update(segment_trg_chunks)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "assert len(source_chunks) == len(ref_chunks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Occurrence count of each chunk divided by the number of training segments,\n",
    "# stored from the most to the least frequent chunk\n",
    "num_segments = float(len(source_chunks))\n",
    "\n",
    "src_priors = OrderedDict()\n",
    "for chunk, count in source_chunk_occs.most_common():\n",
    "    src_priors[chunk] = count / num_segments\n",
    "\n",
    "trg_priors = OrderedDict()\n",
    "for chunk, count in target_chunk_occs.most_common():\n",
    "    trg_priors[chunk] = count / num_segments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# For every source chunk: each co-occurring target chunk's count divided by the\n",
    "# source chunk's total occurrence count, most frequent target chunk first\n",
    "src_posteriors = {}\n",
    "for source_chunk, cooccurring_targets in chunk_map.items():\n",
    "    occurrence_count = float(source_chunk_occs[source_chunk])\n",
    "    ranked_targets = OrderedDict()\n",
    "    for target_chunk, cooccurrence_count in cooccurring_targets.most_common():\n",
    "        ranked_targets[target_chunk] = cooccurrence_count / occurrence_count\n",
    "    src_posteriors[source_chunk] = ranked_targets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# terminology should be ranked by how frequent it is in the target domain vs how frequent it is generally"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# filter keys and values to find good term pairs\n",
    "# filtered_src = [c for c,v in src_priors.items() if len(c.split()) > 1 and any([w[0].isupper() for w in c.split()])]\n",
    "\n",
    "\n",
    "def _has_capitalized_word(chunk):\n",
    "    \"\"\"True when at least one whitespace-separated word of ``chunk`` starts with an uppercase character.\"\"\"\n",
    "    for word in chunk.split():\n",
    "        if word[0].isupper():\n",
    "            return True\n",
    "    return False\n",
    "\n",
    "\n",
    "# Source side: keep any chunk that contains a capitalized word\n",
    "filtered_src = [chunk for chunk in src_priors if _has_capitalized_word(chunk)]\n",
    "\n",
    "# Target side: additionally require more than one word\n",
    "filtered_trg = [chunk for chunk in trg_priors if len(chunk.split()) > 1 and _has_capitalized_word(chunk)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# filtered_posteriors = [(k, [(t, v) for t, v in src_posteriors[k].items()\n",
    "#                             if len(t.split()) > 1 and all([w[0].isupper() for w in t.split()])]) for k in filtered_src]\n",
    "\n",
    "# For each retained source chunk, keep only the target chunks that contain a capitalized word\n",
    "filtered_posteriors = []\n",
    "for source_chunk in filtered_src:\n",
    "    capitalized_targets = []\n",
    "    for target_chunk, posterior in src_posteriors[source_chunk].items():\n",
    "        if any(word[0].isupper() for word in target_chunk.split()):\n",
    "            capitalized_targets.append((target_chunk, posterior))\n",
    "    filtered_posteriors.append((source_chunk, capitalized_targets))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "60933"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(filtered_posteriors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Pair each source chunk with its top-ranked remaining target chunk, subject to the checks below\n",
    "term_pairs = []\n",
    "for source_chunk, candidates in filtered_posteriors:\n",
    "    if len(candidates) == 0:\n",
    "        continue\n",
    "    best_target, best_posterior = candidates[0]\n",
    "    # posterior of the best target must exceed 0.7\n",
    "    if not best_posterior > 0.7:\n",
    "        continue\n",
    "    # source and target strings may differ in length by at most 8 characters\n",
    "    if abs(len(source_chunk) - len(best_target)) > 8:\n",
    "        continue\n",
    "    # source chunk must have been counted more than twice\n",
    "    if not source_chunk_occs[source_chunk] > 2:\n",
    "        continue\n",
    "    term_pairs.append((source_chunk, candidates[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "532"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(term_pairs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(u'U-Value', (u'U-Wert', 1.0)),\n",
       " (u'Video', (u'Video', 0.9579439252336449)),\n",
       " (u'CivilSales', (u'CivilSales', 1.0)),\n",
       " (u'Step', (u'Schritt', 0.7431192660550459)),\n",
       " (u'Frame construction', (u'Rahmenkonstruktion', 0.9888888888888889)),\n",
       " (u'An issue', (u'Ein Problem', 0.8571428571428571)),\n",
       " (u'Block', (u'Block', 0.9838709677419355)),\n",
       " (u'Demonstration', (u'Pr\\xe4sentation', 0.9310344827586207)),\n",
       " (u'Brick', (u'Ziegel', 1.0)),\n",
       " (u'Knowledge Check', (u'Wissens-Check', 0.7727272727272727)),\n",
       " (u'Passive floor', (u'Passive Geschossdecke', 0.9428571428571428)),\n",
       " (u'All rights', (u'All rights', 0.8823529411764706)),\n",
       " (u'the In Canvas Tools', (u'Canvas Tools', 0.9696969696969697)),\n",
       " (u'Raster menu', (u'Men\\xfc Raster', 0.7586206896551724)),\n",
       " (u'Connectivity', (u'Connectivity', 0.8571428571428571)),\n",
       " (u'Bridge', (u'Bridge', 0.72)),\n",
       " (u'Membrane', (u'Folie', 0.9583333333333334)),\n",
       " (u'Microsoft Windows', (u'Microsoft Windows', 0.8260869565217391)),\n",
       " (u'VIDEO', (u'VIDEO', 0.8181818181818182)),\n",
       " (u'ProdSuppBestPrac', (u'ProdSuppBestPrac', 1.0)),\n",
       " (u'Question', (u'Frage', 0.8571428571428571)),\n",
       " (u'Option', (u'Option', 0.8571428571428571)),\n",
       " (u'{2}Autodesk{3}\\xae{4} Revit{5}\\xae{6',\n",
       "  (u'{1}Wir bedanken uns, dass Sie', 0.7142857142857143)),\n",
       " (u'Copyright', (u'Copyright', 1.0)),\n",
       " (u'Build', (u'Option Build', 0.8421052631578947)),\n",
       " (u'Answer', (u'Antwort', 0.8888888888888888)),\n",
       " (u'the USA', (u'Autodesk', 1.0)),\n",
       " (u'Intro', (u'Einf\\xfchrung', 0.9411764705882353)),\n",
       " (u'Introduction', (u'Einf\\xfchrung', 0.8823529411764706)),\n",
       " (u'TIP', (u'TIPP', 1.0)),\n",
       " (u'VaultPO.Intro', (u'VaultPO.Intro', 1.0)),\n",
       " (u'Video Library Catalog', (u'Videobibliothek Nr', 1.0)),\n",
       " (u'BIMStory', (u'BIMStory', 1.0)),\n",
       " (u'NOTE', (u'ANMERKUNG', 0.75)),\n",
       " (u'Scene menu', (u'Men\\xfc Szene', 0.875)),\n",
       " (u'ReCapPO.Intro', (u'ReCapPO.Intro', 1.0)),\n",
       " (u'CivilSalesRail', (u'CivilSalesRail', 1.0)),\n",
       " (u'Autodesk Data Management Server',\n",
       "  (u'Autodesk Data Management Server', 0.9333333333333333)),\n",
       " (u'Autodesk, Inc.', (u'Autodesk, Inc.', 0.8666666666666667)),\n",
       " (u'KC.WrkFlwAuto', (u'KC.WrkFlwAuto', 1.0)),\n",
       " (u'Autodesk{1}\\xae', (u'Autodesk{1}\\xae', 0.7333333333333333)),\n",
       " (u'CivilSalesRoads', (u'CivilSalesRoads', 1.0)),\n",
       " (u'Animation menu', (u'Men\\xfc Animation', 0.7142857142857143)),\n",
       " (u'PDM.PK.VltDataMgt', (u'PDM.PK.VltDataMgt', 1.0)),\n",
       " (u'AdskStory', (u'AdskStory', 1.0)),\n",
       " (u'Story', (u'AdskStory', 1.0)),\n",
       " (u'Demonstration Table', (u'Pr\\xe4sentationstabelle', 0.8571428571428571)),\n",
       " (u'New Features', (u'Neue Funktionen', 0.8461538461538461)),\n",
       " (u'Materials menu', (u'Men\\xfc Materialien', 0.9230769230769231)),\n",
       " (u'Exposed beams', (u'Frei liegende Tr\\xe4ger', 1.0)),\n",
       " (u'press Enter', (u'EINGABETASTE', 0.8461538461538461)),\n",
       " (u'Specifies', (u'Gibt', 0.8461538461538461)),\n",
       " (u'Tip', (u'Tipp', 0.9230769230769231)),\n",
       " (u'Toggles', (u'Aktiviert bzw', 0.75)),\n",
       " (u'PK.01', (u'PK.01', 1.0)),\n",
       " (u'R-15 cavity insulation', (u'Rahmenkonstruktion', 1.0)),\n",
       " (u'\\xa9 Copyright', (u'All rights', 0.9166666666666666)),\n",
       " (u'AutSubsStory', (u'AutSubsStory', 1.0)),\n",
       " (u'R-21 cavity insulation', (u'Rahmenkonstruktion', 1.0)),\n",
       " (u'Keyboard', (u'Tastatur', 0.8333333333333334)),\n",
       " (u'AutoIndustryWkflw', (u'AutoIndustryWkflw', 1.0)),\n",
       " (u'KC.Overview', (u'KC.Overview', 1.0)),\n",
       " (u'R-11 cavity insulation', (u'Rahmenkonstruktion', 1.0)),\n",
       " (u'CompLandTransp', (u'CompLandTransp', 1.0)),\n",
       " (u'{1}Point Cloud tab{2', (u'Klicken Sie auf', 1.0)),\n",
       " (u'Toggles display', (u'Aktiviert bzw', 1.0)),\n",
       " (u'the Engineering Analysis Community',\n",
       "  (u'the Engineering Analysis Community', 1.0)),\n",
       " (u'French door', (u'Fenstert\\xfcr', 1.0)),\n",
       " (u'Autodesk 3ds Max Design',\n",
       "  (u'Autodesk 3ds Max Design', 0.7272727272727273)),\n",
       " (u'System drive', (u'Systemlaufwerk', 1.0)),\n",
       " (u'Home > Services', (u'Home > Services', 1.0)),\n",
       " (u'ReCap_DigAssetCreateFilmGames', (u'ReCap_DigAssetCreateFilmGames', 1.0)),\n",
       " (u'WorkflowRail', (u'WorkflowRail', 1.0)),\n",
       " (u'the Customizations', (u'Bereich Anpassungen', 0.8)),\n",
       " (u'AdvVaultforMfg', (u'AdvVaultforMfg', 1.0)),\n",
       " (u'Microsoft{1}\\xae{2} Windows{3}\\xae{4',\n",
       "  (u'Microsoft{1}\\xae{2} Windows{3}\\xae{4', 1.0)),\n",
       " (u'IntroAECConceptualDsgn', (u'IntroAECConceptualDsgn', 1.0)),\n",
       " (u'MEP_Dsgn2Fab', (u'KC.Overview', 1.0)),\n",
       " (u'IVSim', (u'IVSim', 1.0)),\n",
       " (u'FactoryEnv_PtCldRevitWflw', (u'FactoryEnv_PtCldRevitWflw', 1.0)),\n",
       " (u'AUTODESK{1}\\xae', (u'AUTODESK{1}\\xae', 0.9)),\n",
       " (u'Repeat steps', (u'Wiederholen Sie', 0.9)),\n",
       " (u'FAQ', (u'H\\xe4ufig', 0.8)),\n",
       " (u'Microsoft{1', (u'Microsoft{1', 0.9)),\n",
       " (u'AdvLayoutDsgnAssetsFactoryDSte', (u'AdvLayoutDsgnAssetsFactoryDSte', 1.0)),\n",
       " (u'IS.Trano', (u'CivilSales', 1.0)),\n",
       " (u'Vinyl', (u'Vinyl', 1.0)),\n",
       " (u'a PDF File', (u'eine PDF-Datei', 0.7777777777777778)),\n",
       " (u'Portions', (u'Portions', 0.7777777777777778)),\n",
       " (u'KC.Demo', (u'KC.Demo', 1.0)),\n",
       " (u'the Automatic Size button',\n",
       "  (u'Schaltfl\\xe4che \"Autom', 0.7777777777777778)),\n",
       " (u'Administration', (u'Verwaltung', 0.8888888888888888)),\n",
       " (u'GB RAM', (u'GB RAM', 0.75)),\n",
       " (u'N', (u'N', 0.75)),\n",
       " (u'Batt', (u'Matte', 1.0)),\n",
       " (u'Demonstration Step Table2', (u'Pr\\xe4sentationsschritttabelle', 1.0)),\n",
       " (u'Demonstration Step Table1', (u'Pr\\xe4sentationsschritttabelle', 0.75)),\n",
       " (u'Demonstration Step Table', (u'Pr\\xe4sentationsschritttabelle', 1.0)),\n",
       " (u'MText', (u'MText', 0.75)),\n",
       " (u'About dialog', (u'Info', 1.0)),\n",
       " (u'Configuration', (u'Konfiguration', 0.875)),\n",
       " (u'WARNING', (u'WARNUNG', 0.875)),\n",
       " (u'LandXML', (u'LandXML', 0.75)),\n",
       " (u'PDM.IK.CompLand', (u'PDM.IK.CompLand', 1.0)),\n",
       " (u'Geometric Design', (u'Geometric Design', 1.0)),\n",
       " (u'Windows Server\\u2122', (u'Windows Server\\u2122', 1.0)),\n",
       " (u'every Autodesk Data Management Server',\n",
       "  (u'Autodesk Data Management Server', 0.7142857142857143)),\n",
       " (u'Inventor sometimes unexpectedly exits',\n",
       "  (u'Inventor wird gelegentlich unerwartet beendet', 0.8571428571428571)),\n",
       " (u'related Microsoft downloads',\n",
       "  (u'Weitere Informationen', 0.7142857142857143)),\n",
       " (u'Outlet', (u'Auslass', 0.8571428571428571)),\n",
       " (u'Damage State', (u'Schadenszustand', 0.7142857142857143)),\n",
       " (u'Video Library Catalog #', (u'Videobibliothek Nr', 1.0)),\n",
       " (u'AutoCAD Basics', (u'AutoCAD', 0.8571428571428571)),\n",
       " (u'Explorer', (u'Explorer', 0.7142857142857143)),\n",
       " (u'Roadways', (u'Stra\\xdfen', 1.0)),\n",
       " (u'Autodesk Mudbox', (u'Autodesk Mudbox', 0.7142857142857143)),\n",
       " (u'6 GB', (u'6 GB', 0.8571428571428571)),\n",
       " (u'October', (u'Oktober', 0.7142857142857143)),\n",
       " (u'Prepare', (u'Vorbereitung', 0.7142857142857143)),\n",
       " (u'Customizations', (u'Bereich Anpassungen', 0.8571428571428571)),\n",
       " (u'Radiant floor', (u'Fu\\xdfbodenheizung', 1.0)),\n",
       " (u'(phrase; R', (u'(phrase; R', 1.0)),\n",
       " (u'the Z-axis', (u'Z-Achse', 0.7142857142857143)),\n",
       " (u'DSE.UI.ShowMessage', (u'DSE.UI.ShowMessage', 1.0)),\n",
       " (u'Essential Skills Video', (u'Einf\\xfchrungsvideo', 1.0)),\n",
       " (u'Dynamo', (u'Dynamo', 0.7142857142857143)),\n",
       " (u'Static Stress', (u'Statische Spannung', 0.7142857142857143)),\n",
       " (u'PlntDsgnOilGas_IndWkFlow', (u'PlntDsgnOilGas_IndWkFlow', 1.0)),\n",
       " (u'PDM.BK.Scenario', (u'PDM.BK.Scenario', 1.0)),\n",
       " (u'Wood frame', (u'Holzrahmen', 0.8333333333333334)),\n",
       " (u'Troubleshooting', (u'Fehlerbehebung', 1.0)),\n",
       " (u'the Annotate panel', (u'Multifunktionsleiste auf', 1.0)),\n",
       " (u'Y1', (u'Y1', 1.0)),\n",
       " (u'Metal frame', (u'Metallrahmen', 0.8333333333333334)),\n",
       " (u'Autodesk{1}\\xae{2} AutoCAD{3}\\xae{4',\n",
       "  (u'Autodesk{1}\\xae{2} AutoCAD{3}\\xae{4', 0.8333333333333334)),\n",
       " (u'Press ESC', (u'Dr\\xfccken Sie', 0.8333333333333334)),\n",
       " (u'This material', (u'Dieses', 0.8333333333333334)),\n",
       " (u'QJson', (u'QJson', 0.8333333333333334)),\n",
       " (u'{1} More Helpers', (u'Men\\xfc Objekte', 1.0)),\n",
       " (u'{1}Appendix A{2', (u'1}Anhang A{2', 0.8333333333333334)),\n",
       " (u'Asphalt', (u'Asphalt', 1.0)),\n",
       " (u'the Palette', (u'der Palette', 0.8333333333333334)),\n",
       " (u'RVT2014-32bit){3}{4', (u'RVT2014-32bit){3}{4', 0.8333333333333334)),\n",
       " (u'{1} Autodesk{2}\\xae{3', (u'{1} Autodesk{2}\\xae{3', 0.8333333333333334)),\n",
       " (u'Metal roof', (u'Blechdach', 1.0)),\n",
       " (u'VF', (u'AF', 1.0)),\n",
       " (u'THIS SOFTWARE', (u'THIS SOFTWARE', 1.0)),\n",
       " (u'Microsoft Internet Explorer', (u'Microsoft Internet Explorer', 1.0)),\n",
       " (u'This product', (u'This product', 1.0)),\n",
       " (u'This Readme', (u'Hilfe', 1.0)),\n",
       " (u'the {1}Asset Builder{2} tab', (u'Klicken Sie auf der Registerkarte', 1.0)),\n",
       " (u'MCCw360G.Tech', (u'MCCw360G.Tech', 1.0)),\n",
       " (u'Redistributions', (u'Redistributions', 1.0)),\n",
       " (u'Microsoft SQL Server\\xae', (u'Microsoft SQL Server\\xae', 1.0)),\n",
       " (u'Wyoming', (u'Wyoming', 1.0)),\n",
       " (u'this Autodesk software', (u'this Autodesk software', 1.0)),\n",
       " (u'Modify Properties', (u'\\xe4ndern Sie', 1.0)),\n",
       " (u'Autodesk Robot Structural Analysis',\n",
       "  (u'Autodesk Robot Structural Analysis', 0.8333333333333334)),\n",
       " (u'KC.Tools', (u'KC.Tools', 1.0)),\n",
       " (u'Fabrication CADmep', (u'Fabrication CADmep', 0.8333333333333334)),\n",
       " (u'MATERIAL', (u'Element von', 0.8)),\n",
       " (u'Hashin', (u'Hashin', 0.8)),\n",
       " (u'Sculpt', (u'Formen', 0.8)),\n",
       " (u'Release Notes - Autodesk Inventor LT',\n",
       "  (u'Versionshinweise f\\xfcr Autodesk Inventor LT', 0.8)),\n",
       " (u'PDM.SK.Scenario', (u'PDM.SK.Scenario', 1.0)),\n",
       " (u'Rotational', (u'Drehbar', 0.8)),\n",
       " (u'{1} Helpers', (u'Men\\xfc Objekte', 0.8)),\n",
       " (u'New File Default Template', (u'Neue Datei - Standardvorlage', 1.0)),\n",
       " (u'Design Study Manager', (u'Designstudien-Manager', 1.0)),\n",
       " (u'{1} Tools', (u'Men\\xfc Materialien', 0.8)),\n",
       " (u'oc SIP wall frame', (u'Polystyrol', 0.8)),\n",
       " (u'Microsoft\\xae Windows\\xae', (u'Microsoft\\xae Windows\\xae', 0.8)),\n",
       " (u'Press Esc', (u'Dr\\xfccken Sie', 0.8)),\n",
       " (u'CMU wall', (u'Zoll CMU-Wand', 1.0)),\n",
       " (u'Microsoft Windows Server', (u'Microsoft Windows Server', 1.0)),\n",
       " (u'PK.02', (u'PK.02', 1.0)),\n",
       " (u'the {1}BOM Layout{2} page', (u'F\\xfchren Sie auf der Seite', 0.8)),\n",
       " (u'Demonstration Information', (u'Pr\\xe4sentationsinformationen', 1.0)),\n",
       " (u'Autodesk VRED', (u'Autodesk VRED', 0.8)),\n",
       " (u'KC.Introduction', (u'KC.Introduction', 1.0)),\n",
       " (u'Autodesk Vault File Server', (u'Autodesk Vault File Server', 0.8)),\n",
       " (u'R-5 board insulation', (u'Rahmenkonstruktion', 0.8)),\n",
       " (u'Start menu', (u'Men\\xfc Start', 0.8)),\n",
       " (u'Light plaster', (u'Leichtputz', 1.0)),\n",
       " (u'Ballast', (u'Ballast', 0.8)),\n",
       " (u'R-3 board insulation', (u'Rahmenkonstruktion', 1.0)),\n",
       " (u'R-2 board insulation', (u'Rahmenkonstruktion', 1.0)),\n",
       " (u'Label Styles{2}.', (u'Weitere Informationen', 0.8)),\n",
       " (u'Multi Site', (u'Multi Site', 0.8)),\n",
       " (u'Autodesk Revit{2', (u'Autodesk Revit{2', 1.0)),\n",
       " (u'This content', (u'Juli', 0.8)),\n",
       " (u'Microsoft{1}\\xae{2} Windows{3}\\xae{4} 7 Home Premium',\n",
       "  (u'Microsoft{1}\\xae{2} Windows{3}\\xae{4} 7 Home Premium', 1.0)),\n",
       " (u'Supported versions', (u'Unterst\\xfctzte Versionen', 1.0)),\n",
       " (u'Autodesk AutoCAD{1}{2', (u'Systemvoraussetzungen', 1.0)),\n",
       " (u'Polymers', (u'Polymers', 1.0)),\n",
       " (u'UrgentIssues', (u'ProdSuppBestPrac', 1.0)),\n",
       " (u'This update release addresses issues',\n",
       "  (u'Autodesk direkt hingewiesen wurde', 1.0)),\n",
       " (u'Index', (u'Index', 0.8)),\n",
       " (u'January', (u'Januar', 0.8)),\n",
       " (u'X1', (u'X1', 0.8)),\n",
       " (u'November', (u'November', 0.8)),\n",
       " (u'Inventor LT sometimes unexpectedly exits',\n",
       "  (u'Inventor LT wird gelegentlich unerwartet beendet', 0.8)),\n",
       " (u'R-6 board insulation', (u'Rahmenkonstruktion', 1.0)),\n",
       " (u'the {3}BOM Views{4} panel', (u'F\\xfchren Sie auf der Seite', 0.8)),\n",
       " (u'the ANSYS Toolbar', (u'Schaltfl\\xe4che', 0.8)),\n",
       " (u'the DWG file\\u2019s associated project file',\n",
       "  (u'der DWG-Datei zugeordnete Projektdatei', 0.8)),\n",
       " (u'PDM.IK.IndMachWrkflow', (u'PDM.IK.IndMachWrkflow', 1.0)),\n",
       " (u'The storyboard', (u'Das Drehbuch zeigt', 0.8)),\n",
       " (u'Synthetic stucco', (u'Synthetischer Putz', 0.8)),\n",
       " (u'} \\xa9 Copyright', (u'All rights', 1.0)),\n",
       " (u'your Infrastructure Model', (u'your Infrastructure Model', 1.0)),\n",
       " (u'GetRdy', (u'GetRdy', 1.0)),\n",
       " (u'PDM.IK.IndMachStratOvr', (u'PDM.IK.IndMachStratOvr', 1.0)),\n",
       " (u'{1}{2}Autodesk Revit Structure', (u'{1}{2}Autodesk Revit Structure', 0.8)),\n",
       " (u'ProdNameLong =', (u'ProdNameLong =', 0.8)),\n",
       " (u'{1}, Edit', (u'Sie', 1.0)),\n",
       " (u'R-4 board insulation', (u'Rahmenkonstruktion', 1.0)),\n",
       " (u'Press Enter', (u'Dr\\xfccken Sie', 1.0)),\n",
       " (u'Autodesk\\xae Revit\\xae{2', (u'Autodesk\\xae Revit\\xae{2', 1.0)),\n",
       " (u'Task 1 - Create', (u'Aufgabe', 1.0)),\n",
       " (u'Smart', (u'Intelligent', 0.75)),\n",
       " (u'August', (u'August', 1.0)),\n",
       " (u'POINTCLOUDCLIP', (u'Geben Sie', 0.75)),\n",
       " (u'the Results form', (u'Formular Results auf', 0.75)),\n",
       " (u'the Linux system', (u'erstellen Sie', 1.0)),\n",
       " (u'the resource URL', (u'verwenden Sie', 1.0)),\n",
       " (u'the same Windows', (u'Problem zu vermeiden', 1.0)),\n",
       " (u'Wall:Wall', (u'Wand:Wand', 0.75)),\n",
       " (u'Action Macros', (u'Aktionsmakros', 0.75)),\n",
       " (u'the default Help preference settings',\n",
       "  (u'vorgabem\\xe4\\xdfigen Hilfeeinstellungen zu \\xe4ndern', 0.75)),\n",
       " (u'an NX file', (u'Datei', 1.0)),\n",
       " (u'CATIA', (u'CATIA', 0.75)),\n",
       " (u'AISC', (u'AISC', 1.0)),\n",
       " (u'ALIAS DESIGN', (u'ALIAS DESIGN', 1.0)),\n",
       " (u'Solid Edge', (u'Solid Edge', 1.0)),\n",
       " (u'Supported Versions', (u'Unterst\\xfctzte Versionen', 1.0)),\n",
       " (u'PO.NwxM.KC.Over', (u'PO.NwxM.KC.Over', 1.0)),\n",
       " (u'PDM.IK.AutoSupStratOvr', (u'PDM.IK.AutoSupStratOvr', 1.0)),\n",
       " (u'the Edge', (u'Schritt', 0.75)),\n",
       " (u'Offline help', (u'Offline-Hilfe', 0.75)),\n",
       " (u'Autodesk Sync', (u'das Service Pack', 1.0)),\n",
       " (u'Architects', (u'Architekten', 0.75)),\n",
       " (u'Autodesk Revit Update Release', (u'Autodesk Revit Update Release', 1.0)),\n",
       " (u'this SP2', (u'Inventive{3', 1.0)),\n",
       " (u'Brick cavity', (u'Leichtputz', 0.75)),\n",
       " (u'Wood panel', (u'T\\xfcrf\\xfcllung Holz', 0.75)),\n",
       " (u'Charts', (u'Berichte', 0.75)),\n",
       " (u'Floor:Floor', (u'Geschossdecke', 1.0)),\n",
       " (u'Timber flooring', (u'Fu\\xdfbodenbelag Holz', 1.0)),\n",
       " (u'Autodesk Mockup', (u'Autodesk Mockup', 0.75)),\n",
       " (u'PK.03', (u'PK.03', 1.0)),\n",
       " (u'PK.05', (u'PK.05', 1.0)),\n",
       " (u'an Autodesk deployment', (u'Informationen dazu', 0.75)),\n",
       " (u'Building design', (u'Geb\\xe4udeplanung', 0.75)),\n",
       " (u'Rotate X', (u'Seite', 0.75)),\n",
       " (u'your new Beta serial number', (u'neue Seriennummer f\\xfcr', 1.0)),\n",
       " (u'Autodesk DWG TrueView', (u'Autodesk DWG TrueView', 0.75)),\n",
       " (u'Locking', (u'Sperren', 0.75)),\n",
       " (u'2 GB RAM', (u'2 GB RAM', 0.75)),\n",
       " (u'the Autodesk 360 Mobile app', (u'Autodesk 360 Mobile-App', 0.75)),\n",
       " (u'TO', (u'TO', 1.0)),\n",
       " (u'Snippets', (u'Snippets', 1.0)),\n",
       " (u'E{1}22{2', (u'E{1}22{2', 1.0)),\n",
       " (u'G{1}12{2', (u'G{1}12{2', 1.0)),\n",
       " (u'Design Guide', (u'Design Guide', 1.0)),\n",
       " (u'The {1}Add Items', (u'Das Fenster', 1.0)),\n",
       " (u'The first value', (u'Der erste Wert', 0.75)),\n",
       " (u'White', (u'White', 0.75)),\n",
       " (u'the Text Editor contextual tab',\n",
       "  (u'Werkzeugkasten Textformatierung', 0.75)),\n",
       " (u'Dassault', (u'ANSYS', 0.75)),\n",
       " (u'Professional editions', (u'Autodesk Vault Workgroup', 1.0)),\n",
       " (u'A transition', (u'Ein \\xdcbergang', 0.75)),\n",
       " (u'Deutsch', (u'Deutsch', 1.0)),\n",
       " (u'Data Connect{2}.', (u'Datenverbindung{2}.', 0.75)),\n",
       " (u'White Belt Product Design', (u'White Belt-Spezialisierung auf', 0.75)),\n",
       " (u'Boundaries', (u'Begrenzungen', 0.75)),\n",
       " (u'an Asset', (u'Objekts', 0.75)),\n",
       " (u'the Parameters', (u'Inventor', 0.75)),\n",
       " (u'AutoCAD{2}\\xae{3', (u'AutoCAD{2}\\xae{3', 0.75)),\n",
       " (u'1}One Side{2', (u'1}Eine Seite{2', 1.0)),\n",
       " (u'{1} SpaceWarps', (u'Men\\xfc Objekte', 0.75)),\n",
       " (u'the Search', (u'Datei', 1.0)),\n",
       " (u'SAT', (u'SAT', 0.75)),\n",
       " (u'the Job Manager Daemon', (u'the Job Manager Daemon', 1.0)),\n",
       " (u'Autodesk AutoCAD MEP', (u'Autodesk AutoCAD MEP', 1.0)),\n",
       " (u'the Setup workspace', (u'Arbeitsbereich Einrichtung', 0.75)),\n",
       " (u'Mechanical Engineering', (u'Mechanical Engineering', 0.75)),\n",
       " (u'Autodesk Revit Structure Update Release',\n",
       "  (u'Autodesk Revit Structure Update Release', 1.0)),\n",
       " (u'Symbols', (u'Symbole', 0.75)),\n",
       " (u'PDM.SK.IndMachWrkflow', (u'PDM.SK.IndMachWrkflow', 1.0)),\n",
       " (u'Starts', (u'Startet', 1.0)),\n",
       " (u'NO EVENT', (u'THE USE', 1.0)),\n",
       " (u'Japan', (u'Japan', 0.75)),\n",
       " (u'the ViewCube{2', (u'1}So steuern Sie', 0.75)),\n",
       " (u'PostGIS', (u'PostGIS', 0.75)),\n",
       " (u'1}3D Sketch tab', (u'Klicken Sie', 0.75)),\n",
       " (u'Add file data source', (u'der Palette Data Sources', 1.0)),\n",
       " (u'Autodesk Revit MEP Update Release',\n",
       "  (u'Autodesk Revit MEP Update Release', 1.0)),\n",
       " (u'MERCHANTABILITY', (u'MERCHANTABILITY', 1.0)),\n",
       " (u'the Minimum', (u'Verwenden Sie', 1.0)),\n",
       " (u'Z1', (u'Z1', 0.75)),\n",
       " (u'the Dimension panel', (u'Multifunktionsleiste auf', 1.0)),\n",
       " (u'AS/NZS 3679.1:2010', (u'AS/NZS 3679.1:2010', 0.75)),\n",
       " (u'Toggles visibility', (u'Aktiviert bzw', 1.0)),\n",
       " (u'Garnich', (u'Garnich', 1.0)),\n",
       " (u'Feature Highlights', (u'Feature-Highlights', 1.0)),\n",
       " (u'The downloaded folder', (u'Suite-Arbeitsabl\\xe4ufe', 1.0)),\n",
       " (u'the YZ plane', (u'YZ-Ebene aus', 0.75)),\n",
       " (u'LIABLE', (u'THE USE', 1.0)),\n",
       " (u'an Autodesk Account', (u'Erstellen Sie', 0.75)),\n",
       " (u'What result', (u'Welchen Nutzen', 1.0)),\n",
       " (u'Text Formatting toolbar', (u'Werkzeugkasten Textformatierung', 0.75)),\n",
       " (u'Revisioning Workspaces', (u'Revisionsarbeitsbereichen', 1.0)),\n",
       " (u'The install', (u'Release 2 werden', 0.75)),\n",
       " (u'SelectSingleNode(\"d', (u'SelectSingleNode(\"d', 1.0)),\n",
       " (u'the US', (u'den USA', 0.75)),\n",
       " (u'ACIS', (u'ACIS', 0.75)),\n",
       " (u'Autodesk Simulation DFM', (u'Autodesk Simulation DFM', 0.75)),\n",
       " (u'Robot', (u'Robot', 0.75)),\n",
       " (u'Result file', (u'Ergebnisdatei', 1.0)),\n",
       " (u'Annotate>Dimension', (u'Beschriften>Bema\\xdfung', 1.0)),\n",
       " (u'Msiexec', (u'Msiexec', 1.0)),\n",
       " (u'Apple Safari', (u'Apple Safari', 0.75)),\n",
       " (u'Autodesk Revit LT Update Release',\n",
       "  (u'Autodesk Revit LT Update Release', 1.0)),\n",
       " (u'AutoCAD PDF', (u'AutoCAD PDF', 1.0)),\n",
       " (u'Glossary', (u'Glossar', 0.75)),\n",
       " (u'.NET Framework Version', (u'.NET Framework Version', 1.0)),\n",
       " (u'IK.01', (u'IK.01', 1.0)),\n",
       " (u'the PROPFIND method', (u'Verwenden Sie', 1.0)),\n",
       " (u'Chapter', (u'Kapitel', 1.0)),\n",
       " (u'KNOWN ISSUES', (u'AUTODESK INVENTOR', 0.75)),\n",
       " (u'R-15', (u'R-15', 1.0)),\n",
       " (u'{1}Green Building Studio{2', (u'der Analyse k\\xf6nnen Sie', 0.75)),\n",
       " (u'POINTCLOUDCROP', (u'Geben Sie', 1.0)),\n",
       " (u'1}Two Side{2', (u'1}Zwei Seiten{2', 1.0)),\n",
       " (u'CONTRACT', (u'THE USE', 1.0)),\n",
       " (u'THE USE', (u'THE USE', 1.0)),\n",
       " (u'Your Website', (u'Ihre Website', 0.75)),\n",
       " (u'{1}Autodesk{2} Product', (u'Verteilung von', 1.0)),\n",
       " (u'the Mac App Store version', (u'Einzigartige Funktionen f\\xfcr', 0.75)),\n",
       " (u'a K value', (u'Geben Sie', 0.75)),\n",
       " (u'the GNU Lesser General Public License v.2.1',\n",
       "  (u'the GNU Lesser General Public License v.2.1', 1.0)),\n",
       " (u'PDM.BK.AutoSupStratOvr', (u'PDM.BK.AutoSupStratOvr', 1.0)),\n",
       " (u'Revit{4', (u'Revit{4', 0.75)),\n",
       " (u'component IronPython', (u'Release 2 werden', 0.75)),\n",
       " (u'My Profile', (u'Mein Profil', 0.75)),\n",
       " (u'Autodesk Design Academy', (u'Autodesk Design Academy', 0.75)),\n",
       " (u'Measurements', (u'Messungen', 0.75)),\n",
       " (u'BK.Trano', (u'BK.Trano', 1.0)),\n",
       " (u'Eric Young', (u'Eric Young', 1.0)),\n",
       " (u'Help{4}Help', (u'Sie', 1.0)),\n",
       " (u'Release Notes - Autodesk Inventor View',\n",
       "  (u'Versionshinweise f\\xfcr Autodesk Inventor View', 1.0)),\n",
       " (u'Pro/ENGINEER', (u'Pro/ENGINEER', 0.75)),\n",
       " (u'MS', (u'MS', 1.0)),\n",
       " (u'an ETO project', (u'Das Einrichten', 1.0)),\n",
       " (u'KC.Ecosystem', (u'KC.Ecosystem', 1.0)),\n",
       " (u'the Report Wizard', (u'die Liste', 1.0)),\n",
       " (u'Parameters column', (u'kann der Fl\\xe4chen-Manager', 1.0)),\n",
       " (u'IPT designs', (u'IPT-Konstruktionen', 1.0)),\n",
       " (u'the OpenSSL Project', (u'the OpenSSL Project', 1.0)),\n",
       " (u'Material definitions', (u'der Materialdatei', 1.0)),\n",
       " (u'Mozilla Firefox', (u'Mozilla Firefox', 1.0)),\n",
       " (u'Content Builder', (u'Dienstprogramm f\\xfcr', 1.0)),\n",
       " (u'Autodesk\\xae Infrastructure Design Suite Ultimate',\n",
       "  (u'Autodesk\\xae Infrastructure Design Suite Ultimate', 1.0)),\n",
       " (u'{1}{2}Autodesk{3}\\xae{4} Revit{5}\\xae{6} MEP',\n",
       "  (u'{1}{2}Autodesk{3}\\xae{4} Revit{5}\\xae{6} MEP', 1.0)),\n",
       " (u'{1}Autodesk{2} Material Library',\n",
       "  (u'{1}Autodesk{2} Material Library', 1.0)),\n",
       " (u'DataManagement', (u'DataManagement', 1.0)),\n",
       " (u'ANY WAY', (u'USE', 1.0)),\n",
       " (u'NoAction', (u'NoAction', 1.0)),\n",
       " (u'View AutoCAD MEP videos', (u'Zeigen Sie AutoCAD MEP-Videos', 1.0)),\n",
       " (u'3ds Max\\xae Design', (u'3ds Max\\xae Design', 1.0)),\n",
       " (u'{1}{2}Autodesk{3}\\xae{4} Revit{5}\\xae{6} Structure',\n",
       "  (u'{1}{2}Autodesk{3}\\xae{4} Revit{5}\\xae{6} Structure', 1.0)),\n",
       " (u'Family Editor>Family Editor', (u'Familieneditor>Familieneditor', 1.0)),\n",
       " (u'the Manufacturing industry', (u'Autodesk-Spezialisierung auf', 1.0)),\n",
       " (u'{1}A Policy', (u'Geometric Design', 1.0)),\n",
       " (u'Moldflow products', (u'Mit Moldflow-Produkten', 1.0)),\n",
       " (u'Segment', (u'Segment', 1.0)),\n",
       " (u'Microsoft Internet Explorer{1}\\xae{2',\n",
       "  (u'Microsoft Internet Explorer{1}\\xae{2', 1.0)),\n",
       " (u'Virginia', (u'Virginia', 1.0)),\n",
       " (u'Figure', (u'Abbildung', 1.0)),\n",
       " (u'Kelvin', (u'Kelvin', 1.0)),\n",
       " (u'July', (u'Juli', 1.0)),\n",
       " (u'This topic lists', (u'diesem Thema', 1.0)),\n",
       " (u'PK.06', (u'PK.06', 1.0)),\n",
       " (u'PK.04', (u'PK.04', 1.0)),\n",
       " (u'PK.08', (u'PK.08', 1.0)),\n",
       " (u'PK.09', (u'PK.09', 1.0)),\n",
       " (u'Many Autodesk design tools', (u'vielen Autodesk-Designwerkzeugen', 1.0)),\n",
       " (u'Assy', (u'Assy', 1.0)),\n",
       " (u'GB/T', (u'GB/T', 1.0)),\n",
       " (u'Legal Notices', (u'Rechtliche Hinweise', 1.0)),\n",
       " (u'the Project tab', (u'Registerkarte Projekt', 1.0)),\n",
       " (u'TIN Surface', (u'Point Cloud', 1.0)),\n",
       " (u'Autodesk\\xae Vault Professional',\n",
       "  (u'Autodesk\\xae Vault Professional', 1.0)),\n",
       " (u'the Storyboard panel toolbar', (u'der Gruppe Storyboard', 1.0)),\n",
       " (u'Guidelines', (u'Richtlinien', 1.0)),\n",
       " (u'Citrix{1}\\xae{2} XenDesktop\\u2122',\n",
       "  (u'Citrix{1}\\xae{2} XenDesktop\\u2122', 1.0)),\n",
       " (u'Autodesk\\xae Infrastructure Design Suite Premium',\n",
       "  (u'Autodesk\\xae Infrastructure Design Suite Premium', 1.0)),\n",
       " (u'the Global option', (u'das Eingabefeld', 1.0)),\n",
       " (u'KC.UpgradeVault', (u'AdvVaultforMfg', 1.0)),\n",
       " (u'FULLPRODNAME =', (u'FULLPRODNAME =', 1.0)),\n",
       " (u'Horizontal distance', (u'Horizontaler', 1.0)),\n",
       " (u'Straw bale', (u'Strohballen', 1.0)),\n",
       " (u'the Windows Start button', (u'} Klicken Sie auf', 1.0)),\n",
       " (u'Kawai', (u'Kawai', 1.0)),\n",
       " (u'Dassault Syst\\xe8mes', (u'Dassault Syst\\xe8mes', 1.0)),\n",
       " (u'Modal Frequency', (u'Modalfrequenz', 1.0)),\n",
       " (u'MARK CHRISTIAN', (u'MARK CHRISTIAN', 1.0)),\n",
       " (u'Maximum fields', (u'Verwenden Sie', 1.0)),\n",
       " (u'Linux platforms', (u'Problem auf', 1.0)),\n",
       " (u'Programs -> Autodesk ->', (u'Vergewissern Sie', 1.0)),\n",
       " (u'MS Office', (u'MS Office', 1.0)),\n",
       " (u'a Windows OS 7 Service Pack', (u'So deinstallieren Sie', 1.0)),\n",
       " (u'AFE{1}1{2', (u'AFE{1}1{2', 1.0)),\n",
       " (u'WebLayout{2', (u'WebLayout{2', 1.0)),\n",
       " (u'The new Autodesk Application Manager',\n",
       "  (u'Der neue Autodesk Application Manager', 1.0)),\n",
       " (u'PK.07', (u'PK.07', 1.0)),\n",
       " (u'KC.ConceptualDesign', (u'KC.ConceptualDesign', 1.0)),\n",
       " (u'Radiation heat balance', (u'Strahlungsw\\xe4rmebilanz', 1.0)),\n",
       " (u'PK.10', (u'PK.10', 1.0)),\n",
       " (u'the {1}% Complete{2', (u'Klicken Sie auf', 1.0)),\n",
       " (u'Autodesk Drainage Design', (u'Autodesk Drainage Design', 1.0)),\n",
       " (u'Carpet', (u'Teppich', 1.0)),\n",
       " (u'VERSION =', (u'VERSION =', 1.0)),\n",
       " (u'each Compute Node', (u'jeden Rechenknoten', 1.0)),\n",
       " (u'the Target field', (u'Folgenden gezeigt', 1.0)),\n",
       " (u'the connected WMS layer name', (u'der folgenden Zeichen enth\\xe4lt', 1.0)),\n",
       " (u'Tim Hudson', (u'Tim Hudson', 1.0)),\n",
       " (u'Plasterboard', (u'Gipsplatte', 1.0)),\n",
       " (u'the Autodesk Mockup 360 dialog box', (u'Dialogfeld Autodesk Mockup', 1.0)),\n",
       " (u'PTC Windchill', (u'PTC Windchill', 1.0)),\n",
       " (u'Procurement', (u'Bauteile', 1.0)),\n",
       " (u'VMware ESXi version', (u'Autodesk Vault', 1.0)),\n",
       " (u'Windows Small Business Server\\u2122',\n",
       "  (u'Windows Small Business Server\\u2122', 1.0)),\n",
       " (u'Autodesk\\xae Infrastructure Design Suite Standard',\n",
       "  (u'Autodesk\\xae Infrastructure Design Suite Standard', 1.0)),\n",
       " (u'Nu{1}a{2', (u'Nu{1}a{2', 1.0)),\n",
       " (u'The remaining MCT state variables',\n",
       "  (u'Die \\xfcbrigen MCT-Zustandsvariablen', 1.0)),\n",
       " (u'the Custom ID', (u'benutzerdefinierte ID', 1.0)),\n",
       " (u'Element type', (u'Elementtyp', 1.0)),\n",
       " (u'Autodesk Simulation 2015{2', (u'Autodesk Simulation 2015{2', 1.0)),\n",
       " (u'a structured Business Process Assessment',\n",
       "  (u'Der Kurs wird Ihnen dabei helfen', 1.0)),\n",
       " (u'your Revit building elements', (u'der Analyse k\\xf6nnen Sie', 1.0)),\n",
       " (u'E{1}11{2', (u'E{1}11{2', 1.0)),\n",
       " (u'an Energy Analytical Model', (u'der Analyse k\\xf6nnen Sie', 1.0)),\n",
       " (u'Image demonstration', (u'AutoCAD MEP', 1.0)),\n",
       " (u'Autodesk AutoCAD ecscad', (u'Autodesk AutoCAD ecscad', 1.0)),\n",
       " (u'{1} Data', (u'NICHT', 1.0)),\n",
       " (u'NX', (u'NX', 1.0)),\n",
       " (u'the ANSYS results', (u'Zustandsvariablen', 1.0)),\n",
       " (u'INTRODUCTION ON SEPARATE SCREEN', (u'SEPARATEM BILDSCHIRM ANZEIGEN', 1.0)),\n",
       " (u'the Limits tab', (u'Sie auf', 1.0)),\n",
       " (u'IViewExtensionHandler', (u'Document\" Type=\"Connectivity', 1.0)),\n",
       " (u'openURL:[NSURL URLWithString', (u'openURL:[NSURL URLWithString', 1.0)),\n",
       " (u'ArcSDE servers', (u'Problem auf', 1.0)),\n",
       " (u'Press SHIFT', (u'Dr\\xfccken Sie', 1.0)),\n",
       " (u'Autodesk{1}\\xae{2}{3', (u'Verbesserte Funktionen', 1.0)),\n",
       " (u'DesignStudy', (u'DesignStudy', 1.0)),\n",
       " (u'Damage evolution', (u'Schadensentwicklung', 1.0)),\n",
       " (u'Vt', (u'Vt', 1.0)),\n",
       " (u'KC.Factory', (u'KC.Factory', 1.0)),\n",
       " (u'Shaker Part', (u'Shaker erstellen', 1.0)),\n",
       " (u'Autodesk Inventor model', (u'Autodesk Inventor-Modell', 1.0)),\n",
       " (u'Wood shingles', (u'Holzschindeln', 1.0)),\n",
       " (u'- Your Serial Number', (u'- Seriennummer', 1.0)),\n",
       " (u'the HIN', (u'Verwenden Sie', 1.0)),\n",
       " (u'DirectX{1}\\xae{2', (u'DirectX{1}\\xae{2', 1.0)),\n",
       " (u'Red Hat Enterprise', (u'Red Hat Enterprise', 1.0)),\n",
       " (u'PDM.MS.Scenario', (u'PDM.MS.Scenario', 1.0)),\n",
       " (u'FRP Laminates', (u'FRP Laminates', 1.0)),\n",
       " (u'Microsoft{1}\\xae', (u'Microsoft{1}\\xae', 1.0)),\n",
       " (u'Infrastructure Design Suite{1', (u'Infrastructure Design Suite{1', 1.0)),\n",
       " (u'SUCH DAMAGE', (u'USE', 1.0)),\n",
       " (u'2015 | Autodesk', (u'2015 | Autodesk', 1.0)),\n",
       " (u'ENTER', (u'Stellt', 1.0)),\n",
       " (u'Manipulator', (u'Manipulator', 1.0)),\n",
       " (u'Document\" Type=\"Connectivity', (u'Document\" Type=\"Connectivity', 1.0)),\n",
       " (u'Scan', (u'Scannen', 1.0)),\n",
       " (u'PDM.SK.AutoSupWrkflow', (u'PDM.SK.AutoSupWrkflow', 1.0)),\n",
       " (u'Advanced graphics', (u'Erweiterte Grafik', 1.0)),\n",
       " (u'Global illumination', (u'Globale Illumination', 1.0)),\n",
       " (u'Regional settings', (u'als Dezimaltrennzeichen', 1.0)),\n",
       " (u'Linear dimension', (u'Linearbema\\xdfung', 1.0)),\n",
       " (u'the Display Options tab', (u'Registerkarte Anzeigeoptionen', 1.0)),\n",
       " (u'Manholes', (u'Sch\\xe4chte', 1.0)),\n",
       " (u'the Shortcut tab', (u'Folgenden gezeigt', 1.0)),\n",
       " (u'Autodesk Revit Architecture Update Release',\n",
       "  (u'Autodesk Revit Architecture Update Release', 1.0)),\n",
       " (u'the Layout page', (u'die Liste', 1.0)),\n",
       " (u'KC.Plastics', (u'KC.Plastics', 1.0)),\n",
       " (u'Microsoft{1}\\xae{2} Windows{3}\\xae{4} 7 Professional',\n",
       "  (u'Microsoft{1}\\xae{2} Windows{3}\\xae{4} 7 Professional', 1.0)),\n",
       " (u'DWF/PDF Options palette', (u'Palette Optionen', 1.0)),\n",
       " (u'Certain assembly models', (u'Bestimmte Baugruppenmodelle', 1.0)),\n",
       " (u'Navisworks Simulate{2', (u'Navisworks Simulate{2', 1.0)),\n",
       " (u'JASON ROTENBERG', (u'JASON ROTENBERG', 1.0)),\n",
       " (u'Valid value', (u'G\\xfcltiger Wert', 1.0)),\n",
       " (u'{1}Edit Profile{2', (u'Klicken Sie auf', 1.0)),\n",
       " (u'All Learning Videos', (u'Alle Lernvideos', 1.0)),\n",
       " (u'Interface file', (u'Schnittstellendatei', 1.0)),\n",
       " (u'Visual Communication', (u'Autodesk InfraWorks', 1.0)),\n",
       " (u'the \"Assessment Results', (u'Assessment Results', 1.0)),\n",
       " (u'Mayes', (u'Mayes', 1.0)),\n",
       " (u'Line Type', (u'Linientyp', 1.0)),\n",
       " (u'\\uf0b7 Ticona', (u'\\uf0b7 Ticona', 1.0)),\n",
       " (u'May', (u'Mai', 1.0)),\n",
       " (u'R-10', (u'R-10', 1.0)),\n",
       " (u'native Inventor parts', (u'Unabh\\xe4ngig davon', 1.0)),\n",
       " (u'Examine', (u'Untersuchen Sie', 1.0)),\n",
       " (u'SOL', (u'SOL', 1.0)),\n",
       " (u'Autodesk AutoCAD Electrical', (u'Autodesk AutoCAD Electrical', 1.0)),\n",
       " (u'Console', (u'Console', 1.0)),\n",
       " (u'THE SOFTWARE', (u'THE SOFTWARE', 1.0)),\n",
       " (u'1}Unassigned Fields{2', (u'Klicken Sie unter', 1.0)),\n",
       " (u'Coleman', (u'Coleman', 1.0)),\n",
       " (u'Wood blocks', (u'Holzbl\\xf6cke', 1.0)),\n",
       " (u'Source Folder{2', (u'Quellordner{2', 1.0)),\n",
       " (u'Design Validation', (u'Entwurfsvalidierung', 1.0)),\n",
       " (u\"User's Guide\", (u'Benutzerhandbuch', 1.0)),\n",
       " (u'IGESOUT', (u'Nur IGESOUT', 1.0)),\n",
       " (u'The numerical value', (u'Der numerische Wert', 1.0)),\n",
       " (u'Solid Works', (u'SolidWorks', 1.0)),\n",
       " (u'the Set option', (u'Wenn Sie', 1.0)),\n",
       " (u'AISC Revision', (u'AISC Revision', 1.0)),\n",
       " (u'Hydraulic Resistance', (u'Hydraulic Resistance', 1.0)),\n",
       " (u'{1} AutoCAD{2}\\xae{3', (u'{1} AutoCAD{2}\\xae{3', 1.0)),\n",
       " (u'Unassigns', (u'Hebt', 1.0)),\n",
       " (u'Product Data Sheet', (u'Produktdatenblatt', 1.0)),\n",
       " (u'Asphalt shingles', (u'Asphaltschindeln', 1.0)),\n",
       " (u'Riser', (u'Riser', 1.0)),\n",
       " (u'panel{3}Create Clip box', (u'Klicken Sie auf', 1.0)),\n",
       " (u'MS.Trano', (u'MS.Trano', 1.0)),\n",
       " (u'Failure criteria', (u'Ausfallkriterien', 1.0)),\n",
       " (u'CAM Express', (u'CAM Express', 1.0)),\n",
       " (u'Supported version', (u'Unterst\\xfctzte Version', 1.0))]"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "term_pairs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import codecs\n",
    "\n",
    "# Write the terminology to a TSV file, longest source terms first.\n",
    "# Both sides of every term are tokenized and then BPE-encoded; the processed\n",
    "# (source, target, score) triples are also collected in `all_rules`.\n",
    "output_terms = sorted(term_pairs, key=lambda pair: len(pair[0]), reverse=True)\n",
    "\n",
    "terms_output = 'autodesk.noun_chunk.terminology.tsv'\n",
    "all_rules = []\n",
    "\n",
    "with codecs.open(terms_output, 'w', encoding='utf8') as out:\n",
    "    for raw_source, (raw_target, score) in output_terms:\n",
    "        source_tok = tokenize(raw_source, 'source')\n",
    "        target_tok = tokenize(raw_target, 'target')\n",
    "        source_term = bpe_encode(source_tok)\n",
    "        target_term = bpe_encode(target_tok)\n",
    "        all_rules.append((source_term, target_term, score))\n",
    "        # one rule per line: source <TAB> target <TAB> score\n",
    "        fields = [source_term, target_term, unicode(score)]\n",
    "        out.write(u'\\t'.join(fields) + u'\\n')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import codecs\n",
    "\n",
    "# write dev set to file\n",
    "# Note: we could optionally tokenize and BPE encode dev set here\n",
    "\n",
    "def write_lines(lines, filename):\n",
    "    \"\"\"Write `lines` to `filename` as UTF-8 text, one item per line.\n",
    "\n",
    "    Every line, including the last one, is terminated with a newline, so the\n",
    "    number of newlines in the file equals the number of items written\n",
    "    (`wc -l` agrees across parallel source/target files) and files can be\n",
    "    concatenated without merging two sentences. An empty `lines` yields an\n",
    "    empty file.\n",
    "\n",
    "    :param lines: iterable of unicode strings (expected to contain no newlines)\n",
    "    :param filename: path of the file to create or overwrite\n",
    "    \"\"\"\n",
    "    with codecs.open(filename, 'w', encoding='utf8') as out:\n",
    "        for line in lines:\n",
    "            out.write(line + u'\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Dev-set output paths, derived from SOURCE_LANG / TARGET_LANG (set in the\n",
    "# tokenizer configuration cell) so that switching the language pair there\n",
    "# cannot leave these paths pointing at another pair's directory.\n",
    "# With SOURCE_LANG='en' and TARGET_LANG='pt' this yields\n",
    "#   autodesk_constrained_decoding_corpus/en-pt/autodesk.dev.1000.en  (and .pt)\n",
    "#\n",
    "# NOTE: the earlier En-De variant of this cell used paths in the working\n",
    "# directory instead; uncomment to reproduce that layout:\n",
    "# dev_src_output = 'autodesk.dev.1000.en'\n",
    "# dev_trg_output = 'autodesk.dev.1000.de'\n",
    "dev_output_template = 'autodesk_constrained_decoding_corpus/{src}-{trg}/autodesk.dev.1000.{ext}'\n",
    "dev_src_output = dev_output_template.format(src=SOURCE_LANG, trg=TARGET_LANG, ext=SOURCE_LANG)\n",
    "dev_trg_output = dev_output_template.format(src=SOURCE_LANG, trg=TARGET_LANG, ext=TARGET_LANG)\n",
    "\n",
    "write_lines(dev_sources, dev_src_output)\n",
    "write_lines(dev_refs, dev_trg_output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Training-set output paths, derived from SOURCE_LANG / TARGET_LANG (set in\n",
    "# the tokenizer configuration cell) so that switching the language pair there\n",
    "# cannot leave these paths pointing at another pair's directory.\n",
    "# With SOURCE_LANG='en' and TARGET_LANG='pt' this yields\n",
    "#   autodesk_constrained_decoding_corpus/en-pt/autodesk.train.en  (and .pt)\n",
    "#\n",
    "# NOTE: the earlier En-De variant of this cell used paths in the working\n",
    "# directory instead; uncomment to reproduce that layout:\n",
    "# train_src_output = 'autodesk.train.en'\n",
    "# train_trg_output = 'autodesk.train.de'\n",
    "train_output_template = 'autodesk_constrained_decoding_corpus/{src}-{trg}/autodesk.train.{ext}'\n",
    "train_src_output = train_output_template.format(src=SOURCE_LANG, trg=TARGET_LANG, ext=SOURCE_LANG)\n",
    "train_trg_output = train_output_template.format(src=SOURCE_LANG, trg=TARGET_LANG, ext=TARGET_LANG)\n",
    "\n",
    "write_lines(train_sources, train_src_output)\n",
    "write_lines(train_refs, train_trg_output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# how many constraints occur in the dev set?\n",
    "# First collect the distinct source-side terms of the terminology.\n",
    "# A set comprehension is used on purpose: unlike a Python 2 list comprehension\n",
    "# it does not leak its loop variables into the notebook namespace, so\n",
    "# IPython's `_` (last output) is not overwritten.\n",
    "source_constraints = {src_term for src_term, _target_and_score in output_terms}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "532"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# number of distinct source-side terms\n",
    "len(source_constraints)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "154"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count the dev sentences that contain at least one source term.\n",
    "# Matching is a plain substring test, and each sentence is counted once\n",
    "# no matter how many terms it contains.\n",
    "src_constraint_occs = sum(\n",
    "    1 for source_sen in dev_sources\n",
    "    if any(cons in source_sen for cons in source_constraints)\n",
    ")\n",
    "src_constraint_occs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# split the (source, target, score) rules into three parallel sequences\n",
    "src_rules, trg_rules, rule_scores = zip(*all_rules)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load the tokenized and BPE-encoded dev lines, one sentence per line.\n",
    "# The file is opened in a `with` block so the handle is closed right after\n",
    "# reading instead of being left to the garbage collector.\n",
    "prepped_dev_file = '/home/chris/projects/constrained_decoding/proto/autodesk_constrained_decoding_corpus/autodesk.dev.1000.en.bpe'\n",
    "with codecs.open(prepped_dev_file, encoding='utf8') as inp:\n",
    "    prepped_dev_lines = inp.read().strip().split('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# create terminology spotter\n",
    "from semantic_annotator.spotting import MatchSpotter\n",
    "\n",
    "# the spotter is built from the source side of the rules\n",
    "term_spotter = MatchSpotter(rules=src_rules)\n",
    "\n",
    "\n",
    "# get spots in dev set using term_spotter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Imported here so the cell does not depend on an import made elsewhere\n",
    "# (harmless if OrderedDict is already in scope).\n",
    "from collections import OrderedDict\n",
    "\n",
    "# source rule -> target rule, in rule order; the score is dropped.\n",
    "# If the same source rule occurs more than once, the last target wins.\n",
    "term_pair_map = OrderedDict((src_rule, trg_rule) for src_rule, trg_rule, _score in all_rules)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# spot terminology in every prepared dev line\n",
    "dev_term_spots = [term_spotter.get_spots(line) for line in prepped_dev_lines]\n",
    "\n",
    "# per dev line: the target-side token lists of all spotted source terms\n",
    "dev_term_constraints = []\n",
    "for text, spots in zip(prepped_dev_lines, dev_term_spots):\n",
    "    output_constraints = []\n",
    "    for spot in spots:\n",
    "        start, end = spot[0], spot[1]\n",
    "        if end - start <= 1:\n",
    "            # spans of at most one character are ignored\n",
    "            continue\n",
    "        spotted_term = text[start:end]\n",
    "        mapped_term = term_pair_map[spotted_term]\n",
    "        output_constraints.append(mapped_term.split())\n",
    "    dev_term_constraints.append(output_constraints)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1000"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# one (possibly empty) constraint list per dev line\n",
    "len(dev_term_constraints)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "\n",
    "# dump the per-line dev constraints as pretty-printed JSON\n",
    "OUTPUT_DIR = '/home/chris/projects/constrained_decoding/proto/autodesk_constrained_decoding_corpus'\n",
    "constraints_file = os.path.join(OUTPUT_DIR, 'dev.constraints.json')\n",
    "\n",
    "with codecs.open(constraints_file, 'w', encoding='utf8') as out:\n",
    "    serialized_constraints = json.dumps(dev_term_constraints, indent=2)\n",
    "    out.write(serialized_constraints)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'Autodesk Inventor',\n",
       " u'Simulation CFD',\n",
       " u'Autodesk Vault',\n",
       " u'Service Pack',\n",
       " u'Autodesk InfraWorks',\n",
       " u'Inventor LT',\n",
       " u'Simulation Mechanical',\n",
       " u'Autodesk\\xae Simulation CFD',\n",
       " u'AutoCAD LT',\n",
       " u'Autodesk Revit']"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# preview the first ten entries of top_1000_src\n",
    "top_1000_src[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CHUNK: the fluid, top 5: [(u'i', 0.07109004739336493), (u'das Fluid', 0.014218009478672985), (u'nichtnewtonschen Fluid wird', 0.014218009478672985), (u'Fluid', 0.009478672985781991), (u'das', 0.009478672985781991), (u'bei der das Fluid zu', 0.009478672985781991), (u'Gleichgewicht aufgrund der Schwerkraft', 0.009478672985781991), (u'ihrem Verlauf m\\xfcssen fein genug sein', 0.009478672985781991), (u'System', 0.009478672985781991), (u'Fluid aus\\xfcbt', 0.009478672985781991), (u'Bewegung', 0.009478672985781991), (u'ist', 0.009478672985781991), (u'W\\xe4rmeleitf\\xe4higkeit des Fluids', 0.009478672985781991), (u'und', 0.009478672985781991), (u'Interaktion zwischen Festk\\xf6rper', 0.009478672985781991), (u'Watt', 0.009478672985781991), (u'der', 0.009478672985781991), (u'Interaktion zwischen Festk\\xf6rper und dem Fluid angemessen darzustellen', 0.009478672985781991), (u'dass', 0.009478672985781991), (u'Dichte des Fluids', 0.009478672985781991), (u'Fluid vom Auslass bis zum Einlass', 0.009478672985781991), (u'1}Die Ausstattung erm\\xf6glicht', 0.004739336492890996), (u'dem Festk\\xf6rper gefangen ist', 0.004739336492890996), (u'Wenn das Ventil durch', 0.004739336492890996), (u'Druck', 0.004739336492890996), (u'der Str\\xf6mung', 0.004739336492890996), (u'vorgegebene Durchfluss kann als Randbedingung', 0.004739336492890996), (u'{1} f\\xfcr', 0.004739336492890996), (u'Grenzspannungsrate{1', 0.004739336492890996), (u'Das Netz', 0.004739336492890996), (u'Dies kennzeichnet', 0.004739336492890996), (u'Bei der ersten wird', 0.004739336492890996), (u'Berechnen Sie', 0.004739336492890996), (u'Ger\\xe4te wie Pumpen und L\\xfcfter \\xfcbertragen Energie auf', 0.004739336492890996), (u', wobei q{1}ri{2} der Nettow\\xe4rmestrom zum Fluid an Element i von der Strahlung von Oberfl\\xe4che zu Oberfl\\xe4che darstellt', 0.004739336492890996), (u'Festk\\xf6rper', 0.004739336492890996), (u'Festk\\xf6rper bewegen', 0.004739336492890996), (u'Bewegungsdefinition', 
0.004739336492890996), (u'das Fluid von den Randbedingungen der W\\xe4rme\\xfcbertragung an der Wand aufgenommen hatte', 0.004739336492890996), (u'Grenzviskosit\\xe4t{1', 0.004739336492890996), (u'Mit nur einer Reihe von maskierten Fluidknoten kann nur', 0.004739336492890996), (u'an die W\\xe4nde abgegebene Restw\\xe4rme berechnet', 0.004739336492890996), (u'Energiemenge', 0.004739336492890996), (u'Fluidelemente', 0.004739336492890996), (u'Um dieser Situation zu begegnen', 0.004739336492890996), (u'1}Hydraulisches Drehmoment{2', 0.004739336492890996), (u'dem Fluid', 0.004739336492890996), (u'Die Umformungsgeschwindigkeit', 0.004739336492890996), (u'sodass', 0.004739336492890996), (u'W\\xe4hrend der Iterationen des Mehrphasen-Solvers', 0.004739336492890996)]\n",
      "CHUNK: the viewport, top 5: [(u'i', 0.140625), (u'Ansichtsfenster', 0.03125), (u'Ansichtsfenster angezeigt', 0.0234375), (u'Ansichtsfenster erstellen k\\xf6nnen', 0.01171875), (u'Ihrer Zeichnungsdatei', 0.01171875), (u'Schaltfl\\xe4che', 0.01171875), (u'geografischen Positionsdaten', 0.01171875), (u'Die Karte', 0.01171875), (u'der', 0.01171875), (u'anderen Zeichnungsobjekten i', 0.01171875), (u'Sie', 0.01171875), (u'verwendet', 0.01171875), (u'Steuerelement', 0.01171875), (u'Ansichtsfenster auf', 0.01171875), (u'Wenn Sie', 0.0078125), (u'Verwirbelung', 0.0078125), (u'das Ansichtsfenster', 0.0078125), (u'Dialogfeld', 0.0078125), (u'Klicken Sie auf', 0.0078125), (u'Szene', 0.0078125), (u'Ansichtsfensters', 0.0078125), (u'Ansichtsfenster aus', 0.0078125), (u'wenn Sie', 0.00390625), (u'Ansichtsfenster nicht unterst\\xfctzt und Streugewichtungs-Remap', 0.00390625), (u'Platte aus dem Ansichtsfenster als {1}Region{2}, {3}AS4-PEEK{4} als', 0.00390625), (u'Ansichtsfenster zeigen als', 0.00390625), (u'das Ansichtsfenster m\\xf6glicherweise mehrmals', 0.00390625), (u'sodass durch', 0.00390625), (u'der Befehlszeile eingeben', 0.00390625), (u'Erstellt einen Helfer', 0.00390625), (u'Randbedingung', 0.00390625), (u'Rendern auf ein', 0.00390625), (u'3D-Ansicht fest', 0.00390625), (u'Bei der Verwendung der Einbackoption wird das Ansichtsfenster m\\xf6glicherweise mehrmals neu gezeichnet, wenn Sie Frames nach dem Einbacken \\xe4ndern und dann auf R\\xfcckg\\xe4ngig klicken', 0.00390625), (u'Dies', 0.00390625), (u'verwendet wird', 0.00390625), (u'von der \\xc4nderung betroffen', 0.00390625), (u'Schaltfl\\xe4che {1}Datum CSYS List', 0.00390625), (u'das Ansichtsfenster eingepasst', 0.00390625), (u'auf', 0.00390625), (u'W\\xe4hlen Sie das Ansichtsfenster mit den Objekten aus', 0.00390625), (u'Reflexionen \\xe4ndern', 0.00390625), (u'es wird automatisch i', 0.00390625), (u'Anzeige im Ansichtsfenster anzupassen', 0.00390625), (u'Schwenken', 0.00390625), (u'Quelle der Bauteilliste 
auszuw\\xe4hlen', 0.00390625), (u'eine Datei rendern', 0.00390625), (u'als Bereich f\\xfcr', 0.00390625), (u'intuitive Auswahlwerkzeuge', 0.00390625), (u'sie', 0.00390625)]\n",
      "CHUNK: the amount, top 5: [(u'Dies', 0.030042918454935622), (u'der', 0.017167381974248927), (u'i', 0.017167381974248927), (u'zu', 0.012875536480686695), (u'ben\\xf6tigt wird', 0.012875536480686695), (u'Garn(en', 0.008583690987124463), (u'aus\\xfcbt', 0.008583690987124463), (u'das', 0.008583690987124463), (u') ist', 0.008583690987124463), (u'Materials', 0.008583690987124463), (u'dem Wachstum der', 0.008583690987124463), (u'der f\\xfcr', 0.008583690987124463), (u'F\\xe4llen', 0.008583690987124463), (u'Abaqus/Explicit-Analysen verwendet und gibt den Grad der Verst\\xe4rkung', 0.008583690987124463), (u'W\\xe4rmemenge', 0.008583690987124463), (u'das Drehmoment', 0.008583690987124463), (u'Wenn', 0.008583690987124463), (u'Energiemenge dar', 0.008583690987124463), (u'Menge', 0.008583690987124463), (u'thermische Energiebilanz und protokolliert diese sowie den Betrag der W\\xe4rme\\xfcbertragung aufgrund von W\\xe4rmestrahlung f\\xfcr', 0.004291845493562232), (u'enth\\xe4lt diese Zeile', 0.004291845493562232), (u'denen', 0.004291845493562232), (u'der Einlassstr\\xf6mung', 0.004291845493562232), (u'Es wird dadurch allerdings', 0.004291845493562232), (u'Das System kann', 0.004291845493562232), (u'Gases', 0.004291845493562232), (u'von den Ausl\\xe4ssen', 0.004291845493562232), (u'Daten genau', 0.004291845493562232), (u'Ende des Werkzeugs', 0.004291845493562232), (u'Verwaltung von Standard- und anderen Einstellungen sowie f\\xfcr Materialien erforderlich ist', 0.004291845493562232), (u'Grad des zul\\xe4ssigen', 0.004291845493562232), (u'dass Sie mit Abaqus/Explicit', 0.004291845493562232), (u'Strahlungsenergiebilanz zu verfolgen', 0.004291845493562232), (u'Menge der reflektierten Energie', 0.004291845493562232), (u'Dauer f\\xfcr das Hoch-', 0.004291845493562232), (u'vier notwendigen Parameter f\\xfcr', 0.004291845493562232), (u'Analyse nicht stabil ist', 0.004291845493562232), (u'Energiemenge', 0.004291845493562232), (u'Bereichen', 0.004291845493562232), 
(u'Detailgenauigkeit{1', 0.004291845493562232), (u'zu wissen', 0.004291845493562232), (u'Zu jedem Bauteil', 0.004291845493562232), (u'Auswirkungen auf dem Ger\\xe4t zu simulieren', 0.004291845493562232), (u'Gr\\xf6\\xdfe vieler der zu hochzuladenden und herunterzuladenden Dateien reduziert', 0.004291845493562232), (u'Karte', 0.004291845493562232), (u'Konsistenz zwischen Simulationen sichergestellt und der Zeitaufwand f\\xfcr das Setup nachfolgender Entwurfsiterationen reduziert', 0.004291845493562232), (u'Zeit-', 0.004291845493562232), (u'einer Geometrie kann der zur Berechnung', 0.004291845493562232), (u'wenn Str\\xf6mungs-', 0.004291845493562232), (u'Mit dieser Assoziativit\\xe4t', 0.004291845493562232)]\n",
      "CHUNK: Options, top 5: [(u'Dialogfeld Optionen', 0.19469026548672566), (u'Dialogfeld Optionen > Registerkarte SteeringWheels', 0.07079646017699115), (u'i', 0.05309734513274336), (u'Dialogfeld Optionen > Registerkarte Grafiken', 0.04424778761061947), (u'Optionen', 0.017699115044247787), (u'klicken Sie auf Optionen', 0.017699115044247787), (u'Zu den Optionen geh\\xf6ren', 0.017699115044247787), (u'Zu den Optionen', 0.017699115044247787), (u'Dialogfeld Optionen > Registerkarte Allgemein', 0.017699115044247787), (u'Registerkarte SteeringWheels', 0.017699115044247787), (u'oder Wechseln zwischen Profilen', 0.008849557522123894), (u'Schriftart', 0.008849557522123894), (u'Querprofilpl\\xe4nen verf\\xfcgbar', 0.008849557522123894), (u'Folgende Optionen sind verf\\xfcgbar', 0.008849557522123894), (u'Registerkarte Verzeichnisse > Standardpfad f\\xfcr Benutzerdateien', 0.008849557522123894), (u'Registerkarte Grafiken > Auswahl', 0.008849557522123894), (u'Treppenbalken', 0.008849557522123894), (u'Registerkarte Grafiken > Farben > Halbtransparent', 0.008849557522123894), (u'Visualisierungsdateien', 0.008849557522123894), (u'Stammpfad', 0.008849557522123894), (u'Konfiguration eines', 0.008849557522123894), (u'Verwaltung von Normalen und das Zusammenf\\xfchren von Scheitelpunkten und Fl\\xe4chen', 0.008849557522123894), (u'Suche', 0.008849557522123894), (u'\\xe4ndern', 0.008849557522123894), (u'Steuerung f\\xfcr jede Ansicht', 0.008849557522123894), (u'f\\xfcr', 0.008849557522123894), (u'Optionen f\\xfcr verschiedene Standorte', 0.008849557522123894), (u'Registerkarte Grafiken > Vorauswahl', 0.008849557522123894), (u'Textdarstellung f\\xfcr tempor\\xe4re Bema\\xdfung', 0.008849557522123894), (u'Planlistenteuerung', 0.008849557522123894), (u'Options aus', 0.008849557522123894), (u'Optionen f\\xfcr', 0.008849557522123894), (u'Dialogfeld Optionen > Registerkarte Allgemein > Protokolldatei bereinigen > dann Protokolle l\\xf6schen', 0.008849557522123894), (u'Linien', 
0.008849557522123894), (u'Liste der Optionen f\\xfcr', 0.008849557522123894), (u'Optionen zum Hinzuf\\xfcgen', 0.008849557522123894), (u'Es sind Optionen f\\xfcr das', 0.008849557522123894), (u'Dialogfeld Optionen > Registerkarte Allgemein > Benachrichtigungen > Erinnerungsintervall - Mit Zentraldatei synchronisieren', 0.008849557522123894), (u'Verwaltung von Normalen', 0.008849557522123894), (u'werden', 0.008849557522123894), (u'Informationen zu Einstellungsoptionen', 0.008849557522123894), (u'oder F\\xfcr', 0.008849557522123894), (u'Handlauf', 0.008849557522123894), (u'Ansichten verwenden', 0.008849557522123894), (u'Hinzuf\\xfcgen oder Entfernen', 0.008849557522123894), (u'Hauptwerkzeugkasten auf Optionen', 0.008849557522123894), (u'L\\xf6schen von Stationen', 0.008849557522123894), (u'Dialogfeld Grafikdarstellungsoptionen zulassen', 0.008849557522123894), (u'Dialogfeld Optionen auf der Registerkarte Allgemein den Wert f\\xfcr Messgr\\xf6\\xdfe', 0.008849557522123894), (u'Registerkarte Verzeichnisse', 0.008849557522123894)]\n",
      "CHUNK: the edge, top 5: [(u'der', 0.03636363636363636), (u'an der', 0.02727272727272727), (u'W\\xe4hlen Sie', 0.01818181818181818), (u'Klicken Sie auf', 0.013636363636363636), (u'i', 0.013636363636363636), (u'Kante', 0.00909090909090909), (u'auf', 0.00909090909090909), (u'dass der Werkzeugrand', 0.00909090909090909), (u'Oberseite der Grundplatte', 0.00909090909090909), (u'oder', 0.00909090909090909), (u'Sie dann auf', 0.00909090909090909), (u'n\\xe4chsten liegt', 0.00909090909090909), (u'sicherzustellen', 0.00909090909090909), (u'w\\xe4hlen Sie', 0.00909090909090909), (u'I', 0.00909090909090909), (u'Ebene anzugeben', 0.00909090909090909), (u'Sie geklickt haben', 0.00909090909090909), (u'Mithilfe des Griffs', 0.00909090909090909), (u'Kante erneut ausw\\xe4hlen', 0.004545454545454545), (u', und w\\xe4hlen Sie', 0.004545454545454545), (u'der folgenden Abbildung dargestellte', 0.004545454545454545), (u'Schneide auf dem Daumennagel balanciert', 0.004545454545454545), (u'senkrecht', 0.004545454545454545), (u'Dabei werden mikroskopisch kleine H\\xe4rchen von', 0.004545454545454545), (u'17\\xa0Querschnittsbestandteile', 0.004545454545454545), (u'Begrenzung \\xfcberlappt', 0.004545454545454545), (u'Sie auf Punkte entlang der Kante', 0.004545454545454545), (u'der Mitte oder', 0.004545454545454545), (u'Sonne', 0.004545454545454545), (u'Tragende', 0.004545454545454545), (u'helles', 0.004545454545454545), (u'Knoten auf der Kante zwischen den beiden oberen Volumina', 0.004545454545454545), (u'(helllila) besitzt wie dargestellt', 0.004545454545454545), (u'Kante der Fahrspur', 0.004545454545454545), (u'Kante entlang der Erweiterung verschoben werden', 0.004545454545454545), (u'Breite der Bema\\xdfung bearbeiten', 0.004545454545454545), (u'Sie dann', 0.004545454545454545), (u'prim\\xe4ren Stra\\xdfe', 0.004545454545454545), (u'Falls Sie', 0.004545454545454545), (u'wieder zur\\xfcckziehen', 0.004545454545454545), (u'Bei seiner Methode zur Klingenpr\\xfcfung', 
0.004545454545454545), (u'halten wir', 0.004545454545454545), (u'Alle verdeckten Linien f\\xfcr Plattenkanten anzeigen', 0.004545454545454545), (u'bis zum Rand', 0.004545454545454545), (u'breitet sich bis zum endg\\xfcltigen Versagen seitlich', 0.004545454545454545), (u'Die', 0.004545454545454545), (u'Kante eines', 0.004545454545454545), (u'} und', 0.004545454545454545), (u'gesamte Extrusionsrichtung gleichm\\xe4\\xdfig verl\\xe4uft', 0.004545454545454545), (u'Kante von Arbeitsebene1 aus', 0.004545454545454545)]\n",
      "CHUNK: people, top 5: [(u'herumlaufen werden', 0.015), (u'der', 0.015), (u'auf der Personen stehen', 0.015), (u'Ziehen Sie den Mauszeiger \\xfcber den Boden', 0.015), (u'i', 0.015), (u'Anschlie\\xdfend', 0.01), (u'so ziemlich das erste', 0.01), (u'sie zu sehen bekommen', 0.01), (u'der Liste der Personen ausw\\xe4hlen', 0.01), (u'zu dieser', 0.01), (u'Der Stil soll eine typische Gruppe von Personen darstellen', 0.01), (u'sind', 0.01), (u'Personen', 0.01), (u'Sie einen Kontakt', 0.01), (u'Sie', 0.01), (u'vom Eigent\\xfcmer der Zeichnung', 0.01), (u'Wenn sich', 0.01), (u'Flusses verlassen und', 0.005), (u'indem Sie zweimal', 0.005), (u'einem anderen erscheinen', 0.005), (u': {1}Howell:{2} Seien Sie bereit', 0.005), (u'Raum auf der Etage einem anderen', 0.005), (u'Leute damit zu beeindrucken', 0.005), (u'deaktiviert', 0.005), (u'nicht \\xfcber', 0.005), (u'Dies', 0.005), (u'Einfach ausgedr\\xfcckt geht', 0.005), (u'das Werkzeug', 0.005), (u'Wenn Sie', 0.005), (u'Schiffe', 0.005), (u'Flussbereichen bewegen und', 0.005), (u'{1}Analytische Informationen sammeln:{2} Wenn wir besser verstehen', 0.005), (u'Die Vision von Autodesk ist es', 0.005), (u'Klaren sein', 0.005), (u'Projekten', 0.005), (u'Wenn ein hungriger Hund viel davon frisst, kann', 0.005), (u'[] kennzeichnet {1}Pro{2}-spezifische Inhalte und ist nur f\\xfcr Personen verf\\xfcgbar', 0.005), (u'der Tabelle aus', 0.005), (u'Markieren von Personen', 0.005), (u'RPC-Elemente aus Revit-Modellen', 0.005), (u'Simulation', 0.005), (u'Fluss angezeigt', 0.005), (u'Gelegenheit', 0.005), (u'Schlagen Sie vor', 0.005), (u'wobei', 0.005), (u'wenn', 0.005), (u'Layer', 0.005), (u'Aber', 0.005), (u'die', 0.005), (u'\\xd6ffnet', 0.005)]\n",
      "CHUNK: problems, top 5: [(u'i', 0.021739130434782608), (u'der', 0.017391304347826087), (u'Dieses', 0.013043478260869565), (u'Dieses Service Pack behebt auch die', 0.013043478260869565), (u'Sie', 0.013043478260869565), (u'behebt', 0.013043478260869565), (u'durch Autodesk AutoCAD', 0.008695652173913044), (u'umfasst nicht', 0.008695652173913044), (u'Str\\xf6mung blockieren', 0.008695652173913044), (u'laminaren Bereiche', 0.008695652173913044), (u'lokalen Str\\xf6mungsgeschwindigkeiten verringern', 0.008695652173913044), (u'Dies', 0.008695652173913044), (u'senden Sie eine E-Mail', 0.008695652173913044), (u'm\\xf6glicherweise', 0.008695652173913044), (u'dass', 0.008695652173913044), (u'Problemen mit gro\\xdfen', 0.008695652173913044), (u'Wenn Sie auf Probleme sto\\xdfen', 0.008695652173913044), (u'den Begrenzungs-Layern', 0.008695652173913044), (u'Maustaste darauf klicken', 0.004347826086956522), (u'Datei zur Analyse \\xfcbermittelt wird', 0.004347826086956522), (u'dieser Version', 0.004347826086956522), (u'Programme keine der Befehle', 0.004347826086956522), (u'mit Problemen und der Ausf\\xfchrung von Aufgaben', 0.004347826086956522), (u'bei denen', 0.004347826086956522), (u'm\\xf6glicherweise nur mit einer ganzheitlichen Betrachtung des Projekts', 0.004347826086956522), (u'Probleme mit Freistrahl', 0.004347826086956522), (u'Modelle entwickeln', 0.004347826086956522), (u'geben Sie Ihren Anforderungscode sowie', 0.004347826086956522), (u'Wichtige Informationen zum Aufheben', 0.004347826086956522), (u'Wenn Probleme beim Laden', 0.004347826086956522), (u'Identifizieren Sie Probleme', 0.004347826086956522), (u'wurden oder nicht mehr unterst\\xfctzt werden', 0.004347826086956522), (u'erfordern daher eine', 0.004347826086956522), (u'Oracle 12cR1 ist', 0.004347826086956522), (u'das Pr\\xfcfprotokoll zur Fehlerbehebung verwenden', 0.004347826086956522), (u'Ansonsten kann es zu Problemen bei Materialzuweisung', 0.004347826086956522), (u'Computer au\\xdferhalb der', 
0.004347826086956522), (u'entsprechenden Mengen im{3}Entwurf fehlen.{4}Eine Bauablaufsimulation', 0.004347826086956522), (u'mit Temperatur\\xe4nderungen ist m\\xf6glicherweise', 0.004347826086956522), (u'zur Erkennung von Problemen wie:{2}Vers\\xe4umnis', 0.004347826086956522), (u'fachbereichs\\xfcbergreifender BIM-Koordination', 0.004347826086956522), (u'Allerdings', 0.004347826086956522), (u'Aufgrund von ausf\\xfchrlichen Informationen', 0.004347826086956522), (u'Ausf\\xe4lle im Modell beeinflussen k\\xf6nnen', 0.004347826086956522), (u'Gr\\xf6\\xdfenbestimmung', 0.004347826086956522), (u'zeigt, dass Aushub \\xfcber noch nicht gebaute Strukturen bewegt wird.{5}Dies sind Dinge', 0.004347826086956522), (u'Support@autodesk.com{2} senden', 0.004347826086956522), (u'da', 0.004347826086956522), (u'pr\\xfcfen Sie', 0.004347826086956522), (u'Datei zur Analyse', 0.004347826086956522)]\n",
      "CHUNK: your design, top 5: [(u'der', 0.025), (u'Sie', 0.025), (u'i', 0.02), (u'Referenzierung nicht rechteckiger Modellfl\\xe4chen', 0.015), (u'beliebigen Formen erleichtert', 0.015), (u'Das Zuschneiden von Bereichen', 0.015), (u'Sie mit Alias Werkzeuge', 0.01), (u'Komplexit\\xe4t der Konstruktion oder', 0.01), (u'denselben Typ auch f\\xfcr bereits', 0.01), (u'Der Kunde zeigt Ihre Konstruktion', 0.01), (u'bestehenden Segmente und Formteile aus, und klicken Sie auf der Registerkarte \\xc4ndern auf', 0.01), (u'Dank der', 0.01), (u'nach Gr\\xf6\\xdfe', 0.01), (u'zu optimieren', 0.01), (u'Routing-Voreinstellungen\\xb7oder\\xb7die\\xb7\\ufffc{1}Einstellung Winkel unter HLS-Einstellungen{2}\\ufffc\\xb7\\xe4ndern', 0.01), (u'Ihrem Entwurf', 0.01), (u'Verwenden Sie', 0.01), (u'w\\xe4hlen Sie', 0.01), (u'Configurator', 0.01), (u'Entwurf \\xfcbernehmen', 0.01), (u'Ihrem Design hin', 0.005), (u'Simulationssoftware k\\xf6nnen Sie vor der Fertigung', 0.005), (u'einem Entwurf einzuf\\xfcgen', 0.005), (u'verwendet anschlie\\xdfend', 0.005), (u'Sie haben auch die', 0.005), (u'Bedingungen', 0.005), (u'Kommunizieren Sie Ihre Entw\\xfcrfe', 0.005), (u'Jetzt k\\xf6nnen Sie', 0.005), (u'Alias weist dieses Werkzeug zur Fl\\xe4chenbeurteilung auf Flecken mit gleicher Helligkeit', 0.005), (u'selbst', 0.005), (u'Ihren Entwurf erl\\xe4utern', 0.005), (u'Konstruktion zu animieren', 0.005), (u'Erstellen und variieren Sie Ihre Entw\\xfcrfe effizient', 0.005), (u'direkt als PDF-Datei plotten', 0.005), (u'Eigenschaften tempor\\xe4rer Ansichten erm\\xf6glichen Ihnen professionelle Entw\\xfcrfe', 0.005), (u'Abh\\xe4ngigkeit', 0.005), (u'Angenommen', 0.005), (u'Revit noch einfacher verwalten', 0.005), (u'Zum Erstellen von Videos', 0.005), (u'Dar\\xfcber', 0.005), (u'\\xdcberarbeitung von Dateien', 0.005), (u'beste L\\xf6sung f\\xfcr Sie', 0.005), (u'Pr\\xfcfen der Konstruktion', 0.005), (u'Funktionsweise eines Produkts unter', 0.005), (u'\\xc4nderungen hinzu', 0.005), (u'Eine typische 
Zeichnungsaktualisierung beginnt mit dem Auschecken einer Konstruktion aus dem Tresor', 0.005), (u'Der Kunde konfiguriert Ihre Konstruktion', 0.005), (u'der Zeichnung zuordnen', 0.005), (u'CAD Design-Geopositionsfunktionen reale Umgebungen', 0.005), (u'Ihre Konstruktion', 0.005)]\n",
      "CHUNK: the BOM, top 5: [(u'der', 0.06880733944954129), (u'der St\\xfcckliste', 0.03669724770642202), (u'Werte der einzelnen Positionen', 0.022935779816513763), (u'Sie', 0.022935779816513763), (u'das h\\xf6her ist als', 0.01834862385321101), (u'St\\xfcckliste', 0.01834862385321101), (u'Rollup-Berechnung', 0.013761467889908258), (u'Revision', 0.013761467889908258), (u'der St\\xfcckliste angezeigten Werten unterscheidet', 0.013761467889908258), (u'von den', 0.013761467889908258), (u'dass', 0.013761467889908258), (u'Wenn', 0.013761467889908258), (u'Rollup', 0.009174311926605505), (u'der Ausgangswert beispielsweise von der', 0.009174311926605505), (u'Anzeigen der St\\xfcckliste', 0.009174311926605505), (u'Wenn Sie', 0.009174311926605505), (u'Datum ausw\\xe4hlen', 0.009174311926605505), (u'identifiziert das System', 0.009174311926605505), (u'ein Feld als', 0.009174311926605505), (u'Wenn der Ausgangswert beispielsweise von der Registerkarte', 0.009174311926605505), (u'oder Sie m\\xf6chten sich', 0.009174311926605505), (u'erl\\xe4utert', 0.009174311926605505), (u'Findet', 0.009174311926605505), (u'und konfiguriert', 0.009174311926605505), (u'Das Symbol wird mit einer QuickInfo angezeigt', 0.009174311926605505), (u'dem', 0.009174311926605505), (u'f\\xfchrte', 0.009174311926605505), (u'w\\xe4hlen Sie', 0.009174311926605505), (u'wird eine Funktion', 0.009174311926605505), (u'angewendet', 0.009174311926605505), (u'sind oder ein \\xfcbergeordneter Wert', 0.009174311926605505), (u'Beispielsweise wurde', 0.0045871559633027525), (u'kleinste Zahl', 0.0045871559633027525), (u'das Feld aus der St\\xfcckliste oder auf der Seite Artikeldetails aus, dessen Wert i', 0.0045871559633027525), (u'Eine St\\xfcckliste mit eindeutigen Ebenen-', 0.0045871559633027525), (u'Datentyp gesucht', 0.0045871559633027525), (u'1}Validierungen und Abh\\xe4ngigkeiten ignorieren', 0.0045871559633027525), (u'Wenn Sie eine', 0.0045871559633027525), (u'Rollup-Felder erm\\xf6glichen', 
0.0045871559633027525), (u'G\\xfcltigkeit verwenden', 0.0045871559633027525), (u'der aktuell ausgew\\xe4hlten Artikelrevision liegt, identifiziert das System', 0.0045871559633027525), (u'Beispielsweise', 0.0045871559633027525), (u'Vor dem Import der St\\xfcckliste werden', 0.0045871559633027525), (u'Revision der einzelnen Artikel', 0.0045871559633027525), (u'St\\xfcckliste auf der Registerkarte', 0.0045871559633027525), (u'eine oder', 0.0045871559633027525), (u'sie', 0.0045871559633027525), (u'Wenn das Datum', 0.0045871559633027525), (u'w\\xe4hlt rekursiv', 0.0045871559633027525), (u'anderen Eingabefelds', 0.0045871559633027525)]\n",
      "CHUNK: fiber failure, top 5: [(u'Wenn', 0.03187250996015936), (u'Faser hat versagt', 0.01593625498007968), (u'immer aktualisiert', 0.01593625498007968), (u'energiebasierte Degradation aktiviert ist', 0.01593625498007968), (u'Anzahl der Zyklen bis zum Versagen auf den Wert bei Faserversagen gesetzt', 0.01593625498007968), (u'sei denn', 0.01593625498007968), (u'diesem Fall wird', 0.01593625498007968), (u'i', 0.01593625498007968), (u'sodass die roten Elemente', 0.01593625498007968), (u'Dieser Wert', 0.01593625498007968), (u'allen Garnen', 0.01195219123505976), (u'als', 0.00796812749003984), (u'allen Verbundlagen unter', 0.00796812749003984), (u'Das hei\\xdft', 0.00796812749003984), (u'Tritt jedoch ein Faserversagen auf', 0.00796812749003984), (u'Steifheit der Faserkonstituente dieselbe', 0.00796812749003984), (u'dem Verbundspannungszustand bei Faserversagen', 0.00796812749003984), (u'Genau', 0.00796812749003984), (u'gesagt werden', 0.00796812749003984), (u'Spektrum', 0.00796812749003984), (u'roten Elemente entsprechen dem Matrixversagen', 0.00796812749003984), (u'dieser Energie', 0.00796812749003984), (u'durch', 0.00796812749003984), (u'Grenzwerte von Abaqus automatisch auf einen Wertebereich von', 0.00796812749003984), (u'Faserversagen statt dem Matrixversagen entsprechen', 0.00796812749003984), (u'bleibt', 0.00796812749003984), (u'bis 2 angepasst und', 0.00796812749003984), (u'Anders ausgedr\\xfcckt', 0.00796812749003984), (u'Wenn es z. 
B.', 0.00796812749003984), (u'Auswirkung des Faserausfalls auf', 0.00796812749003984), (u'der', 0.00796812749003984), (u'dass', 0.00796812749003984), (u'dem Elementvolumen', 0.00796812749003984), (u'Faserfehler', 0.00796812749003984), (u'Aber bei einem Faserfehler wird der Bereich auf', 0.00796812749003984), (u'Verbundwerkstoff', 0.00796812749003984), (u'f\\xfcr den', 0.00796812749003984), (u'es gibt', 0.00398406374501992), (u'Bei positiver transversaler Normalspannung', 0.00398406374501992), (u'bietet', 0.00398406374501992), (u'dem Spannungszustand des Verbundwerkstoffs bei Matrixausfall', 0.00398406374501992), (u'Bei positiver l\\xe4ngsseitiger Normalspannung', 0.00398406374501992), (u'Verbundwerkstoffe {1}, {2}, {3}, {4} und {5} zerfallen nach einem Matrixausfallereignis entsprechend dieser Energie', 0.00398406374501992), (u'gr\\xf6\\xdfer als', 0.00398406374501992), (u'SVAR1-Wert', 0.00398406374501992), (u'Der Faserausfall hingegen wird als isotropes Ereignis betrachtet', 0.00398406374501992), (u'Faserausfall aufgetreten ist', 0.00398406374501992), (u'den Faserausfall ein', 0.00398406374501992), (u'Versagen ist', 0.00398406374501992), (u'Andernfalls', 0.00398406374501992)]\n"
     ]
    }
   ],
   "source": [
    "# Print the first TOP_N (target phrase, probability) pairs for a small sample\n",
    "# of source chunks (entries 300-309 in src_priors iteration order).\n",
    "# NOTE(review): this relies on src_priors / src_posteriors[chunk] iterating in\n",
    "# a meaningful (presumably sorted) order -- confirm where they are built.\n",
    "TOP_N = 50  # keeps the printed label in sync with the number of pairs shown\n",
    "for src_chunk in list(src_priors)[300:310]:\n",
    "    top_pairs = list(src_posteriors[src_chunk].items())[:TOP_N]\n",
    "    print(u'CHUNK: {}, top {}: {}'.format(src_chunk, TOP_N, top_pairs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(u'who', 0.0019538882375928096),\n",
       " (u'order', 0.0019445840078899867),\n",
       " (u'the ribbon', 0.0019445840078899867),\n",
       " (u'use', 0.0019352797781871638),\n",
       " (u'the part', 0.001925975548484341),\n",
       " (u'a drawing', 0.0018794543999702264),\n",
       " (u'addition', 0.0018701501702674036),\n",
       " (u'BIM', 0.0018422374811589348),\n",
       " (u'Select', 0.001823629021753289),\n",
       " (u'the name', 0.0018050205623476433),\n",
       " (u'the design', 0.0017957163326448203),\n",
       " (u'a file', 0.0017957163326448203),\n",
       " (u'insulation', 0.0017212824950222372),\n",
       " (u'the top', 0.0017119782653194143),\n",
       " (u'(100 mm', 0.0017026740356165912),\n",
       " (u'the flow', 0.0016747613465081226),\n",
       " (u'models', 0.0016747613465081226),\n",
       " (u'the display', 0.0016747613465081226),\n",
       " (u'Autodesk Inventor', 0.0016654571168052998),\n",
       " (u'design', 0.001637544427696831),\n",
       " (u'features', 0.001628240197994008),\n",
       " (u'InfraWorks', 0.0015072852118573103),\n",
       " (u'the command', 0.0014979809821544874),\n",
       " (u'materials', 0.0014979809821544874),\n",
       " (u'an object', 0.0014793725227488417),\n",
       " (u'the installation', 0.0014607640633431957),\n",
       " (u'the right', 0.0014514598336403729),\n",
       " (u'the surface', 0.0014235471445319043),\n",
       " (u'They', 0.0014142429148290812),\n",
       " (u'items', 0.0013956344554234355),\n",
       " (u'the dialog box', 0.0013863302257206126),\n",
       " (u'failure', 0.0013863302257206126),\n",
       " (u'installation', 0.0013863302257206126),\n",
       " (u'drawings', 0.0013770259960177898),\n",
       " (u'AutoCAD Civil 3D', 0.0013770259960177898),\n",
       " (u'the ability', 0.001349113306909321),\n",
       " (u'file', 0.0013305048475036752),\n",
       " (u'Simulation CFD', 0.0013212006178008522),\n",
       " (u'Autodesk Vault', 0.0013212006178008522),\n",
       " (u'the product', 0.0013118963880980293),\n",
       " (u'geometry', 0.0013118963880980293),\n",
       " (u'the bottom', 0.0013118963880980293),\n",
       " (u'the left', 0.0013118963880980293),\n",
       " (u'elements', 0.0013025921583952064),\n",
       " (u'components', 0.0013025921583952064),\n",
       " (u'a project', 0.0012932879286923836),\n",
       " (u'your model', 0.0012839836989895607),\n",
       " (u'the device', 0.0012839836989895607),\n",
       " (u'the user', 0.0012746794692867379),\n",
       " (u'Service Pack', 0.0012653752395839148)]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display entries 50-99 of src_priors as (source chunk, value) pairs;\n",
    "# the values are presumably prior probabilities -- TODO confirm.\n",
    "# Python 2 only: dict.items() returns a list here, so it can be sliced directly.\n",
    "src_priors.items()[50:100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "dedupe\n",
    "\n",
    "no_dup_sources = []\n",
    "no_dup_refs = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# TODO: shuffle jointly, then split into train/test\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The NVIDIA® iray® renderer now supports the NVIDIA® Kepler chipset.Die NVIDIA ® iray ® Renderer unterstützt jetzt die NVIDIA ® Kepler Chipsatz.Der NVIDIA® iray®-Renderer bietet nun Unterstützung für den Chipsatz NVIDIA® Kepler.3DSMAX2013MT0.000362012/11/13 00:13:54◊÷\n"
     ]
    }
   ],
   "source": [
    "# Print the first raw row read from the Autodesk corpus file, before it is\n",
    "# split into columns.\n",
    "print(autodesk_rows[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'The NVIDIA\\xae iray\\xae renderer now supports the NVIDIA\\xae Kepler chipset.\\uf8ffDie NVIDIA \\xae iray \\xae Renderer unterst\\xfctzt jetzt die NVIDIA \\xae Kepler Chipsatz.\\uf8ffDer NVIDIA\\xae iray\\xae-Renderer bietet nun Unterst\\xfctzung f\\xfcr den Chipsatz NVIDIA\\xae Kepler.\\uf8ff3DSMAX\\uf8ff2013\\uf8ffMT\\uf8ff0.000\\uf8ff36\\uf8ff\\uf8ff2012/11/13 00:13:54\\uf8ff\\u25ca\\xf7',\n",
       " u'{1}Home tab{2}Modify panel {3}> Draworder flyout{4}.\\uf8ff{1}Registerkarte Start{2}Gruppe \\xc4ndern {3}> Flyout Zeichnungsreihenfolge{4}.\\uf8ff{1}Registerkarte Start{2}Gruppe \\xc4ndern {3}> Flyout Zeichnungsreihenfolge{4}.\\uf8ffACD\\uf8ff2014\\uf8ffMT\\uf8ff0.074\\uf8ff70\\uf8ff<phs><ph id=\"1\">&lt;menucascade id=&quot;GUID-9206B026-680D-4738-8931-1F0F6D54CB36&quot;&gt; &lt;uicontrol id=&quot;GUID-3B7B95A6-B68E-40CD-9670-D0298139ABEF&quot;&gt;</ph><ph id=\"2\">&lt;/uicontrol&gt; &lt;uicontrol id=&quot;GUID-DDBA3BDC-A9DA-49FE-9C08-4FBDD5CC200E&quot;&gt;</ph><ph id=\"3\">&lt;/uicontrol&gt; &lt;/menucascade&gt; &lt;glyph type=&quot;Arrow&quot; id=&quot;GUID-0EDA2621-2439-4D3E-9EF7-B95261449F86&quot; /&gt;&lt;glyph type=&quot;panel_expander&quot; id=&quot;GUID-E13C2A2B-24EF-49F5-B84E-E4BDCE1724B6&quot; /&gt; &lt;menucascade id=&quot;GUID-0363DB41-CA1E-40F4-A8D7-2E1522146B7E&quot;&gt; &lt;uicontrol id=&quot;GUID-34F09A89-71DF-472C-A4F1-D217CB8427D1&quot;&gt;</ph><ph id=\"4\">&lt;/uicontrol&gt; &lt;/menucascade&gt;</ph></phs>\\uf8ff2012/11/19 19:18:17\\uf8ff\\u25ca\\xf7',\n",
       " u'From the drop-down list, choose one of the options.\\uf8ffaus der Dropdown-Liste auflisten, w\\xe4hlen Sie eine der Optionen aus.\\uf8ffW\\xe4hlen Sie in der Dropdown-Liste eine der Optionen aus.\\uf8ffACD\\uf8ff2014\\uf8ffMT\\uf8ff0.000\\uf8ff69\\uf8ff\\uf8ff2012/11/19 19:18:17\\uf8ff\\u25ca\\xf7']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the first three raw rows; each row's fields are delimited by the\n",
    "# U+F8FF character that the loading cell splits on.\n",
    "autodesk_rows[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
